fs/btrfs/file.c

/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"

/* simple helper to fault in pages and copy. This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                     int write_bytes,
                     struct page **prepared_pages,
                     const char __user *buf)
{
    long page_fault = 0;
    int i;
    int offset = pos & (PAGE_CACHE_SIZE - 1);

    for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
        size_t count = min_t(size_t,
                     PAGE_CACHE_SIZE - offset, write_bytes);
        struct page *page = prepared_pages[i];
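        /*
         * Pre-fault the user page up front: the copy below runs
         * with the destination page kmapped and locked, and any
         * fault it still takes is reported back as a short copy
         * and turned into -EFAULT.
         */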
        fault_in_pages_readable(buf, count);

        /* Copy data from userspace to the current page */
        kmap(page);
        page_fault = __copy_from_user(page_address(page) + offset,
                          buf, count);
        /* Flush processor's dcache for this page */
        flush_dcache_page(page);
        kunmap(page);
        buf += count;
        write_bytes -= count;

        if (page_fault)
            break;
    }
    return page_fault ? -EFAULT : 0;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
    size_t i;
    for (i = 0; i < num_pages; i++) {
        if (!pages[i])
            break;
        /* PageChecked is some magic around finding pages that
         * have been modified without going through
         * btrfs_set_page_dirty. Clear it here.
         */
        ClearPageChecked(pages[i]);
        unlock_page(pages[i]);
        mark_page_accessed(pages[i]);
        page_cache_release(pages[i]);
    }
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
                   struct btrfs_root *root,
                   struct file *file,
                   struct page **pages,
                   size_t num_pages,
                   loff_t pos,
                   size_t write_bytes)
{
    int err = 0;
    int i;
    struct inode *inode = fdentry(file)->d_inode;
    u64 num_bytes;
    u64 start_pos;
    u64 end_of_last_block;
    u64 end_pos = pos + write_bytes;
    loff_t isize = i_size_read(inode);

    start_pos = pos & ~((u64)root->sectorsize - 1);
    num_bytes = (write_bytes + pos - start_pos +
            root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
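    /*
     * Worked example (assuming a 4K sectorsize): a 100 byte write
     * at pos 5000 gives start_pos 4096 and num_bytes 4096 -- the
     * range is rounded out to the full sectors it touches.
     */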

    end_of_last_block = start_pos + num_bytes - 1;
    err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
                    NULL);
    if (err)
        return err;

    for (i = 0; i < num_pages; i++) {
        struct page *p = pages[i];
        SetPageUptodate(p);
        ClearPageChecked(p);
        set_page_dirty(p);
    }
    if (end_pos > isize) {
        i_size_write(inode, end_pos);
        /* we've only changed i_size in ram, and we haven't updated
         * the disk i_size. There is no need to log the inode
         * at this time.
         */
    }
    return err;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end]. Existing extents are split as required.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                int skip_pinned)
{
    struct extent_map *em;
    struct extent_map *split = NULL;
    struct extent_map *split2 = NULL;
    struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
    u64 len = end - start + 1;
    int ret;
    int testend = 1;
    unsigned long flags;
    int compressed = 0;

    WARN_ON(end < start);
    if (end == (u64)-1) {
        len = (u64)-1;
        testend = 0;
    }
    while (1) {
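        /*
         * Pre-allocate the split maps before taking the tree
         * lock: em_tree->lock is a rwlock, so we must not sleep
         * in the allocator while holding it.
         */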
        if (!split)
            split = alloc_extent_map(GFP_NOFS);
        if (!split2)
            split2 = alloc_extent_map(GFP_NOFS);

        write_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        if (!em) {
            write_unlock(&em_tree->lock);
            break;
        }
        flags = em->flags;
        if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
            if (testend && em->start + em->len >= start + len) {
                free_extent_map(em);
                write_unlock(&em_tree->lock);
                break;
            }
            start = em->start + em->len;
            if (testend)
                len = start + len - (em->start + em->len);
            free_extent_map(em);
            write_unlock(&em_tree->lock);
            continue;
        }
        compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
        clear_bit(EXTENT_FLAG_PINNED, &em->flags);
        remove_extent_mapping(em_tree, em);

        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
            em->start < start) {
            split->start = em->start;
            split->len = start - em->start;
            split->orig_start = em->orig_start;
            split->block_start = em->block_start;

            if (compressed)
                split->block_len = em->block_len;
            else
                split->block_len = split->len;

            split->bdev = em->bdev;
            split->flags = flags;
            ret = add_extent_mapping(em_tree, split);
            BUG_ON(ret);
            free_extent_map(split);
            split = split2;
            split2 = NULL;
        }
        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
            testend && em->start + em->len > start + len) {
            u64 diff = start + len - em->start;

            split->start = start + len;
            split->len = em->start + em->len - (start + len);
            split->bdev = em->bdev;
            split->flags = flags;

            if (compressed) {
                split->block_len = em->block_len;
                split->block_start = em->block_start;
                split->orig_start = em->orig_start;
            } else {
                split->block_len = split->len;
                split->block_start = em->block_start + diff;
                split->orig_start = split->start;
            }

            ret = add_extent_mapping(em_tree, split);
            BUG_ON(ret);
            free_extent_map(split);
            split = NULL;
        }
        write_unlock(&em_tree->lock);

        /* once for us */
        free_extent_map(em);
        /* once for the tree */
        free_extent_map(em);
    }
    if (split)
        free_extent_map(split);
    if (split2)
        free_extent_map(split2);
    return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end. hint_byte is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
               u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
    struct btrfs_root *root = BTRFS_I(inode)->root;
    struct extent_buffer *leaf;
    struct btrfs_file_extent_item *fi;
    struct btrfs_path *path;
    struct btrfs_key key;
    struct btrfs_key new_key;
    u64 search_start = start;
    u64 disk_bytenr = 0;
    u64 num_bytes = 0;
    u64 extent_offset = 0;
    u64 extent_end = 0;
    int del_nr = 0;
    int del_slot = 0;
    int extent_type;
    int recow;
    int ret;

    if (drop_cache)
        btrfs_drop_extent_cache(inode, start, end - 1, 0);

    path = btrfs_alloc_path();
    if (!path)
        return -ENOMEM;

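    /*
     * del_slot/del_nr batch up runs of adjacent items that fall
     * entirely inside the range, so they can be removed with a
     * single btrfs_del_items() call per leaf.
     */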
    while (1) {
        recow = 0;
        ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
                           search_start, -1);
        if (ret < 0)
            break;
        if (ret > 0 && path->slots[0] > 0 && search_start == start) {
            leaf = path->nodes[0];
            btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
            if (key.objectid == inode->i_ino &&
                key.type == BTRFS_EXTENT_DATA_KEY)
                path->slots[0]--;
        }
        ret = 0;
next_slot:
        leaf = path->nodes[0];
        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
            BUG_ON(del_nr > 0);
            ret = btrfs_next_leaf(root, path);
            if (ret < 0)
                break;
            if (ret > 0) {
                ret = 0;
                break;
            }
            leaf = path->nodes[0];
            recow = 1;
        }

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (key.objectid > inode->i_ino ||
            key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
            break;

        fi = btrfs_item_ptr(leaf, path->slots[0],
                    struct btrfs_file_extent_item);
        extent_type = btrfs_file_extent_type(leaf, fi);

        if (extent_type == BTRFS_FILE_EXTENT_REG ||
            extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
            disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
            num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
            extent_offset = btrfs_file_extent_offset(leaf, fi);
            extent_end = key.offset +
                btrfs_file_extent_num_bytes(leaf, fi);
        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
            extent_end = key.offset +
                btrfs_file_extent_inline_len(leaf, fi);
        } else {
            WARN_ON(1);
            extent_end = search_start;
        }

        if (extent_end <= search_start) {
            path->slots[0]++;
            goto next_slot;
        }

        search_start = max(key.offset, start);
        if (recow) {
            btrfs_release_path(root, path);
            continue;
        }

        /*
         *      | - range to drop - |
         *  | -------- extent -------- |
         */
        if (start > key.offset && end < extent_end) {
            BUG_ON(del_nr > 0);
            BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

            memcpy(&new_key, &key, sizeof(new_key));
            new_key.offset = start;
            ret = btrfs_duplicate_item(trans, root, path,
                           &new_key);
            if (ret == -EAGAIN) {
                btrfs_release_path(root, path);
                continue;
            }
            if (ret < 0)
                break;

            leaf = path->nodes[0];
            fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                        struct btrfs_file_extent_item);
            btrfs_set_file_extent_num_bytes(leaf, fi,
                            start - key.offset);

            fi = btrfs_item_ptr(leaf, path->slots[0],
                        struct btrfs_file_extent_item);

            extent_offset += start - key.offset;
            btrfs_set_file_extent_offset(leaf, fi, extent_offset);
            btrfs_set_file_extent_num_bytes(leaf, fi,
                            extent_end - start);
            btrfs_mark_buffer_dirty(leaf);

            if (disk_bytenr > 0) {
                ret = btrfs_inc_extent_ref(trans, root,
                        disk_bytenr, num_bytes, 0,
                        root->root_key.objectid,
                        new_key.objectid,
                        start - extent_offset);
                BUG_ON(ret);
                *hint_byte = disk_bytenr;
            }
            key.offset = start;
        }
        /*
         *  | ---- range to drop ----- |
         *      | -------- extent -------- |
         */
        if (start <= key.offset && end < extent_end) {
            BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

            memcpy(&new_key, &key, sizeof(new_key));
            new_key.offset = end;
            btrfs_set_item_key_safe(trans, root, path, &new_key);

            extent_offset += end - key.offset;
            btrfs_set_file_extent_offset(leaf, fi, extent_offset);
            btrfs_set_file_extent_num_bytes(leaf, fi,
                            extent_end - end);
            btrfs_mark_buffer_dirty(leaf);
            if (disk_bytenr > 0) {
                inode_sub_bytes(inode, end - key.offset);
                *hint_byte = disk_bytenr;
            }
            break;
        }

        search_start = extent_end;
        /*
         *       | ---- range to drop ----- |
         *  | -------- extent -------- |
         */
        if (start > key.offset && end >= extent_end) {
            BUG_ON(del_nr > 0);
            BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

            btrfs_set_file_extent_num_bytes(leaf, fi,
                            start - key.offset);
            btrfs_mark_buffer_dirty(leaf);
            if (disk_bytenr > 0) {
                inode_sub_bytes(inode, extent_end - start);
                *hint_byte = disk_bytenr;
            }
            if (end == extent_end)
                break;

            path->slots[0]++;
            goto next_slot;
        }

        /*
         *  | ---- range to drop ----- |
         *    | ------ extent ------ |
         */
        if (start <= key.offset && end >= extent_end) {
            if (del_nr == 0) {
                del_slot = path->slots[0];
                del_nr = 1;
            } else {
                BUG_ON(del_slot + del_nr != path->slots[0]);
                del_nr++;
            }

            if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                inode_sub_bytes(inode,
                        extent_end - key.offset);
                extent_end = ALIGN(extent_end,
                           root->sectorsize);
            } else if (disk_bytenr > 0) {
                ret = btrfs_free_extent(trans, root,
                        disk_bytenr, num_bytes, 0,
                        root->root_key.objectid,
                        key.objectid, key.offset -
                        extent_offset);
                BUG_ON(ret);
                inode_sub_bytes(inode,
                        extent_end - key.offset);
                *hint_byte = disk_bytenr;
            }

            if (end == extent_end)
                break;

            if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
                path->slots[0]++;
                goto next_slot;
            }

            ret = btrfs_del_items(trans, root, path, del_slot,
                          del_nr);
            BUG_ON(ret);

            del_nr = 0;
            del_slot = 0;

            btrfs_release_path(root, path);
            continue;
        }

        BUG_ON(1);
    }

    if (del_nr > 0) {
        ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
        BUG_ON(ret);
    }

    btrfs_free_path(path);
    return ret;
}

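/*
 * Helper for btrfs_mark_extent_written: check whether the extent item
 * at 'slot' points into the same disk extent (bytenr, orig_offset) and
 * has no encoding that would prevent merging.  Non-zero incoming
 * *start or *end values must match the item's range; on success they
 * are set to the item's [start, end) range and 1 is returned.
 */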
static int extent_mergeable(struct extent_buffer *leaf, int slot,
                u64 objectid, u64 bytenr, u64 orig_offset,
                u64 *start, u64 *end)
{
    struct btrfs_file_extent_item *fi;
    struct btrfs_key key;
    u64 extent_end;

    if (slot < 0 || slot >= btrfs_header_nritems(leaf))
        return 0;

    btrfs_item_key_to_cpu(leaf, &key, slot);
    if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
        return 0;

    fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
    if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
        btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
        btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
        btrfs_file_extent_compression(leaf, fi) ||
        btrfs_file_extent_encryption(leaf, fi) ||
        btrfs_file_extent_other_encoding(leaf, fi))
        return 0;

    extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
    if ((*start && *start != key.offset) || (*end && *end != extent_end))
        return 0;

    *start = key.offset;
    *end = extent_end;
    return 1;
}

/*
 * Mark the extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of the extent is marked as written, the extent will be split
 * into two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                  struct inode *inode, u64 start, u64 end)
{
    struct btrfs_root *root = BTRFS_I(inode)->root;
    struct extent_buffer *leaf;
    struct btrfs_path *path;
    struct btrfs_file_extent_item *fi;
    struct btrfs_key key;
    struct btrfs_key new_key;
    u64 bytenr;
    u64 num_bytes;
    u64 extent_end;
    u64 orig_offset;
    u64 other_start;
    u64 other_end;
    u64 split;
    int del_nr = 0;
    int del_slot = 0;
    int recow;
    int ret;

    btrfs_drop_extent_cache(inode, start, end - 1, 0);

    path = btrfs_alloc_path();
    if (!path)
        return -ENOMEM;
again:
    recow = 0;
    split = start;
    key.objectid = inode->i_ino;
    key.type = BTRFS_EXTENT_DATA_KEY;
    key.offset = split;

    ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    if (ret > 0 && path->slots[0] > 0)
        path->slots[0]--;

    leaf = path->nodes[0];
    btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    BUG_ON(key.objectid != inode->i_ino ||
           key.type != BTRFS_EXTENT_DATA_KEY);
    fi = btrfs_item_ptr(leaf, path->slots[0],
                struct btrfs_file_extent_item);
    BUG_ON(btrfs_file_extent_type(leaf, fi) !=
           BTRFS_FILE_EXTENT_PREALLOC);
    extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
    BUG_ON(key.offset > start || extent_end < end);

    bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
    num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
    orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
    memcpy(&new_key, &key, sizeof(new_key));

    if (start == key.offset && end < extent_end) {
        other_start = 0;
        other_end = start;
        if (extent_mergeable(leaf, path->slots[0] - 1,
                     inode->i_ino, bytenr, orig_offset,
                     &other_start, &other_end)) {
            new_key.offset = end;
            btrfs_set_item_key_safe(trans, root, path, &new_key);
            fi = btrfs_item_ptr(leaf, path->slots[0],
                        struct btrfs_file_extent_item);
            btrfs_set_file_extent_num_bytes(leaf, fi,
                            extent_end - end);
            btrfs_set_file_extent_offset(leaf, fi,
                             end - orig_offset);
            fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                        struct btrfs_file_extent_item);
            btrfs_set_file_extent_num_bytes(leaf, fi,
                            end - other_start);
            btrfs_mark_buffer_dirty(leaf);
            goto out;
        }
    }

    if (start > key.offset && end == extent_end) {
        other_start = end;
        other_end = 0;
        if (extent_mergeable(leaf, path->slots[0] + 1,
                     inode->i_ino, bytenr, orig_offset,
                     &other_start, &other_end)) {
            fi = btrfs_item_ptr(leaf, path->slots[0],
                        struct btrfs_file_extent_item);
            btrfs_set_file_extent_num_bytes(leaf, fi,
                            start - key.offset);
            path->slots[0]++;
            new_key.offset = start;
            btrfs_set_item_key_safe(trans, root, path, &new_key);

            fi = btrfs_item_ptr(leaf, path->slots[0],
                        struct btrfs_file_extent_item);
            btrfs_set_file_extent_num_bytes(leaf, fi,
                            other_end - start);
            btrfs_set_file_extent_offset(leaf, fi,
                             start - orig_offset);
            btrfs_mark_buffer_dirty(leaf);
            goto out;
        }
    }

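    /*
     * Split the pre-allocated extent so that [start, end) ends up
     * in its own item; each btrfs_duplicate_item() below adds one
     * more reference to the underlying disk extent.
     */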
    while (start > key.offset || end < extent_end) {
        if (key.offset == start)
            split = end;

        new_key.offset = split;
        ret = btrfs_duplicate_item(trans, root, path, &new_key);
        if (ret == -EAGAIN) {
            btrfs_release_path(root, path);
            goto again;
        }
        BUG_ON(ret < 0);

        leaf = path->nodes[0];
        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                    struct btrfs_file_extent_item);
        btrfs_set_file_extent_num_bytes(leaf, fi,
                        split - key.offset);

        fi = btrfs_item_ptr(leaf, path->slots[0],
                    struct btrfs_file_extent_item);

        btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
        btrfs_set_file_extent_num_bytes(leaf, fi,
                        extent_end - split);
        btrfs_mark_buffer_dirty(leaf);

        ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                       root->root_key.objectid,
                       inode->i_ino, orig_offset);
        BUG_ON(ret);

        if (split == start) {
            key.offset = start;
        } else {
            BUG_ON(start != key.offset);
            path->slots[0]--;
            extent_end = end;
        }
        recow = 1;
    }

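    /*
     * The item covering [start, end) is now at path->slots[0].
     * Try to merge it with its neighbours; every successful merge
     * removes one item and drops one reference on the disk extent.
     */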
    other_start = end;
    other_end = 0;
    if (extent_mergeable(leaf, path->slots[0] + 1,
                 inode->i_ino, bytenr, orig_offset,
                 &other_start, &other_end)) {
        if (recow) {
            btrfs_release_path(root, path);
            goto again;
        }
        extent_end = other_end;
        del_slot = path->slots[0] + 1;
        del_nr++;
        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                    0, root->root_key.objectid,
                    inode->i_ino, orig_offset);
        BUG_ON(ret);
    }
    other_start = 0;
    other_end = start;
    if (extent_mergeable(leaf, path->slots[0] - 1,
                 inode->i_ino, bytenr, orig_offset,
                 &other_start, &other_end)) {
        if (recow) {
            btrfs_release_path(root, path);
            goto again;
        }
        key.offset = other_start;
        del_slot = path->slots[0];
        del_nr++;
        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                    0, root->root_key.objectid,
                    inode->i_ino, orig_offset);
        BUG_ON(ret);
    }
    if (del_nr == 0) {
        fi = btrfs_item_ptr(leaf, path->slots[0],
               struct btrfs_file_extent_item);
        btrfs_set_file_extent_type(leaf, fi,
                       BTRFS_FILE_EXTENT_REG);
        btrfs_mark_buffer_dirty(leaf);
    } else {
        fi = btrfs_item_ptr(leaf, del_slot - 1,
               struct btrfs_file_extent_item);
        btrfs_set_file_extent_type(leaf, fi,
                       BTRFS_FILE_EXTENT_REG);
        btrfs_set_file_extent_num_bytes(leaf, fi,
                        extent_end - key.offset);
        btrfs_mark_buffer_dirty(leaf);

        ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
        BUG_ON(ret);
    }
out:
    btrfs_free_path(path);
    return 0;
}

/*
 * this gets pages into the page cache and locks them down. It also
 * properly waits for data=ordered extents to finish before allowing
 * the pages to be modified.
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
             struct page **pages, size_t num_pages,
             loff_t pos, unsigned long first_index,
             unsigned long last_index, size_t write_bytes)
{
    struct extent_state *cached_state = NULL;
    int i;
    unsigned long index = pos >> PAGE_CACHE_SHIFT;
    struct inode *inode = fdentry(file)->d_inode;
    int err = 0;
    u64 start_pos;
    u64 last_pos;

    start_pos = pos & ~((u64)root->sectorsize - 1);
    last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

    if (start_pos > inode->i_size) {
        err = btrfs_cont_expand(inode, start_pos);
        if (err)
            return err;
    }

    memset(pages, 0, num_pages * sizeof(struct page *));
again:
    for (i = 0; i < num_pages; i++) {
        pages[i] = grab_cache_page(inode->i_mapping, index + i);
        if (!pages[i]) {
            /* release anything we grabbed so far */
            while (--i >= 0) {
                unlock_page(pages[i]);
                page_cache_release(pages[i]);
            }
            return -ENOMEM;
        }
        wait_on_page_writeback(pages[i]);
    }
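    /*
     * A data=ordered extent still pending in this range means
     * earlier IO has not finished; in that case unlock everything,
     * wait for it to complete, and retry from 'again'.
     */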
    if (start_pos < inode->i_size) {
        struct btrfs_ordered_extent *ordered;
        lock_extent_bits(&BTRFS_I(inode)->io_tree,
                 start_pos, last_pos - 1, 0, &cached_state,
                 GFP_NOFS);
        ordered = btrfs_lookup_first_ordered_extent(inode,
                                last_pos - 1);
        if (ordered &&
            ordered->file_offset + ordered->len > start_pos &&
            ordered->file_offset < last_pos) {
            btrfs_put_ordered_extent(ordered);
            unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                         start_pos, last_pos - 1,
                         &cached_state, GFP_NOFS);
            for (i = 0; i < num_pages; i++) {
                unlock_page(pages[i]);
                page_cache_release(pages[i]);
            }
            btrfs_wait_ordered_range(inode, start_pos,
                         last_pos - start_pos);
            goto again;
        }
        if (ordered)
            btrfs_put_ordered_extent(ordered);

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
                  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
                  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
                  GFP_NOFS);
        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                     start_pos, last_pos - 1, &cached_state,
                     GFP_NOFS);
    }
    for (i = 0; i < num_pages; i++) {
        clear_page_dirty_for_io(pages[i]);
        set_page_extent_mapped(pages[i]);
        WARN_ON(!PageLocked(pages[i]));
    }
    return 0;
}

static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                size_t count, loff_t *ppos)
{
    loff_t pos;
    loff_t start_pos;
    ssize_t num_written = 0;
    ssize_t err = 0;
    int ret = 0;
    struct inode *inode = fdentry(file)->d_inode;
    struct btrfs_root *root = BTRFS_I(inode)->root;
    struct page **pages = NULL;
    int nrptrs;
    struct page *pinned[2];
    unsigned long first_index;
    unsigned long last_index;
    int will_write;

    will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
              (file->f_flags & O_DIRECT));

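    /*
     * Size the page pointer array for the whole write, but cap it
     * at one page worth of pointers so the kmalloc() below stays
     * small.
     */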
    nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
             PAGE_CACHE_SIZE / (sizeof(struct page *)));
    pinned[0] = NULL;
    pinned[1] = NULL;

    pos = *ppos;
    start_pos = pos;

    vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

    /* do the reserve before the mutex lock in case we have to do some
     * flushing. We wouldn't deadlock, but this is more polite.
     */
    err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
    if (err)
        goto out_nolock;

    mutex_lock(&inode->i_mutex);

    current->backing_dev_info = inode->i_mapping->backing_dev_info;
    err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
    if (err)
        goto out;

    if (count == 0)
        goto out;

    err = file_remove_suid(file);
    if (err)
        goto out;

    file_update_time(file);

    pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
    if (!pages) {
        err = -ENOMEM;
        goto out;
    }

    /* generic_write_checks can change our pos */
    start_pos = pos;

    BTRFS_I(inode)->sequence++;
    first_index = pos >> PAGE_CACHE_SHIFT;
    last_index = (pos + count) >> PAGE_CACHE_SHIFT;

    /*
     * there are lots of better ways to do this, but this code
     * makes sure the first and last page in the file range are
     * up to date and ready for cow
     */
    if ((pos & (PAGE_CACHE_SIZE - 1))) {
        pinned[0] = grab_cache_page(inode->i_mapping, first_index);
        if (!PageUptodate(pinned[0])) {
            ret = btrfs_readpage(NULL, pinned[0]);
            BUG_ON(ret);
            wait_on_page_locked(pinned[0]);
        } else {
            unlock_page(pinned[0]);
        }
    }
    if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
        pinned[1] = grab_cache_page(inode->i_mapping, last_index);
        if (!PageUptodate(pinned[1])) {
            ret = btrfs_readpage(NULL, pinned[1]);
            BUG_ON(ret);
            wait_on_page_locked(pinned[1]);
        } else {
            unlock_page(pinned[1]);
        }
    }

    while (count > 0) {
        size_t offset = pos & (PAGE_CACHE_SIZE - 1);
        size_t write_bytes = min(count, nrptrs *
                    (size_t)PAGE_CACHE_SIZE -
                     offset);
        size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
                    PAGE_CACHE_SHIFT;

        WARN_ON(num_pages > nrptrs);
        memset(pages, 0, sizeof(struct page *) * nrptrs);

        ret = btrfs_check_data_free_space(root, inode, write_bytes);
        if (ret)
            goto out;

        ret = prepare_pages(root, file, pages, num_pages,
                    pos, first_index, last_index,
                    write_bytes);
        if (ret) {
            btrfs_free_reserved_data_space(root, inode,
                               write_bytes);
            goto out;
        }

        ret = btrfs_copy_from_user(pos, num_pages,
                       write_bytes, pages, buf);
        if (ret) {
            btrfs_free_reserved_data_space(root, inode,
                               write_bytes);
            btrfs_drop_pages(pages, num_pages);
            goto out;
        }

        ret = dirty_and_release_pages(NULL, root, file, pages,
                          num_pages, pos, write_bytes);
        btrfs_drop_pages(pages, num_pages);
        if (ret) {
            btrfs_free_reserved_data_space(root, inode,
                               write_bytes);
            goto out;
        }

        if (will_write) {
            filemap_fdatawrite_range(inode->i_mapping, pos,
                         pos + write_bytes - 1);
        } else {
            balance_dirty_pages_ratelimited_nr(inode->i_mapping,
                               num_pages);
            if (num_pages <
                (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
                btrfs_btree_balance_dirty(root, 1);
            btrfs_throttle(root);
        }

        buf += write_bytes;
        count -= write_bytes;
        pos += write_bytes;
        num_written += write_bytes;

        cond_resched();
    }
out:
    mutex_unlock(&inode->i_mutex);
    if (ret)
        err = ret;
    btrfs_unreserve_metadata_for_delalloc(root, inode, 1);

out_nolock:
    kfree(pages);
    if (pinned[0])
        page_cache_release(pinned[0]);
    if (pinned[1])
        page_cache_release(pinned[1]);
    *ppos = pos;

    /*
     * we want to make sure fsync finds this change
     * but we haven't joined a transaction running right now.
     *
     * Later on, someone is sure to update the inode and get the
     * real transid recorded.
     *
     * We set last_trans now to the fs_info generation + 1,
     * this will either be one more than the running transaction
     * or the generation used for the next transaction if there isn't
     * one running right now.
     */
    BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;

    if (num_written > 0 && will_write) {
        struct btrfs_trans_handle *trans;

        err = btrfs_wait_ordered_range(inode, start_pos, num_written);
        if (err)
            num_written = err;

        if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
            trans = btrfs_start_transaction(root, 1);
            ret = btrfs_log_dentry_safe(trans, root,
                            file->f_dentry);
            if (ret == 0) {
                ret = btrfs_sync_log(trans, root);
                if (ret == 0)
                    btrfs_end_transaction(trans, root);
                else
                    btrfs_commit_transaction(trans, root);
            } else if (ret != BTRFS_NO_LOG_SYNC) {
                btrfs_commit_transaction(trans, root);
            } else {
                btrfs_end_transaction(trans, root);
            }
        }
        if (file->f_flags & O_DIRECT) {
            invalidate_mapping_pages(inode->i_mapping,
                  start_pos >> PAGE_CACHE_SHIFT,
                 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
        }
    }
    current->backing_dev_info = NULL;
    return num_written ? num_written : err;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
    /*
     * ordered_data_close is set by setattr when we are about to
     * truncate a file from a non-zero size to a zero size. This
     * tries to flush down new bytes that may have been written if
     * the application were using truncate to replace a file in
     * place.
     */
    if (BTRFS_I(inode)->ordered_data_close) {
        BTRFS_I(inode)->ordered_data_close = 0;
        btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
        if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
            filemap_flush(inode->i_mapping);
    }
    if (filp->private_data)
        btrfs_ioctl_trans_end(filp);
    return 0;
}

/*
 * fsync call for both files and directories. This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all the ordered extent
 * updates in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit. This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
    struct inode *inode = dentry->d_inode;
    struct btrfs_root *root = BTRFS_I(inode)->root;
    int ret = 0;
    struct btrfs_trans_handle *trans;

    /* we wait first, since the writeback may change the inode */
    root->log_batch++;
    /* the VFS called filemap_fdatawrite for us */
    btrfs_wait_ordered_range(inode, 0, (u64)-1);
    root->log_batch++;

    /*
     * check the transaction that last modified this inode
     * and see if it's already been committed
     */
    if (!BTRFS_I(inode)->last_trans)
        goto out;

    /*
     * if the last transaction that changed this file was before
     * the current transaction, we can bail out now without any
     * syncing
     */
    mutex_lock(&root->fs_info->trans_mutex);
    if (BTRFS_I(inode)->last_trans <=
        root->fs_info->last_trans_committed) {
        BTRFS_I(inode)->last_trans = 0;
        mutex_unlock(&root->fs_info->trans_mutex);
        goto out;
    }
    mutex_unlock(&root->fs_info->trans_mutex);

    /*
     * ok we haven't committed the transaction yet, let's do a commit
     */
    if (file && file->private_data)
        btrfs_ioctl_trans_end(file);

    trans = btrfs_start_transaction(root, 1);
    if (!trans) {
        ret = -ENOMEM;
        goto out;
    }

    ret = btrfs_log_dentry_safe(trans, root, dentry);
    if (ret < 0)
        goto out;

    /* we've logged all the items and now have a consistent
     * version of the file in the log. It is possible that
     * someone will come in and modify the file, but that's
     * fine because the log is consistent on disk, and we
     * have references to all of the file's extents
     *
     * It is possible that someone will come in and log the
     * file again, but that will end up using the synchronization
     * inside btrfs_sync_log to keep things safe.
     */
    mutex_unlock(&dentry->d_inode->i_mutex);

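    /*
     * btrfs_log_dentry_safe() returning BTRFS_NO_LOG_SYNC means no
     * log sync is needed at all; any other positive value means the
     * inode could not be logged and we must fall back to a full
     * transaction commit.
     */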
    if (ret != BTRFS_NO_LOG_SYNC) {
        if (ret > 0) {
            ret = btrfs_commit_transaction(trans, root);
        } else {
            ret = btrfs_sync_log(trans, root);
            if (ret == 0)
                ret = btrfs_end_transaction(trans, root);
            else
                ret = btrfs_commit_transaction(trans, root);
        }
    } else {
        ret = btrfs_end_transaction(trans, root);
    }
    mutex_lock(&dentry->d_inode->i_mutex);
out:
    return ret > 0 ? -EIO : ret;
}

static const struct vm_operations_struct btrfs_file_vm_ops = {
    .fault = filemap_fault,
    .page_mkwrite = btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
    vma->vm_ops = &btrfs_file_vm_ops;
    file_accessed(filp);
    return 0;
}

const struct file_operations btrfs_file_operations = {
    .llseek = generic_file_llseek,
    .read = do_sync_read,
    .aio_read = generic_file_aio_read,
    .splice_read = generic_file_splice_read,
    .write = btrfs_file_write,
    .mmap = btrfs_file_mmap,
    .open = generic_file_open,
    .release = btrfs_release_file,
    .fsync = btrfs_sync_file,
    .unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = btrfs_ioctl,
#endif
};
