Root/fs/splice.c

1/*
2 * "splice": joining two ropes together by interweaving their strands.
3 *
4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other.
7 *
8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer.
10 *
11 * Named by Larry McVoy, original implementation from Linus, extended by
12 * Jens to support splicing to files, network, direct splicing, etc and
13 * fixing lots of bugs.
14 *
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18 *
19 */
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/pagemap.h>
23#include <linux/splice.h>
24#include <linux/memcontrol.h>
25#include <linux/mm_inline.h>
26#include <linux/swap.h>
27#include <linux/writeback.h>
28#include <linux/buffer_head.h>
29#include <linux/module.h>
30#include <linux/syscalls.h>
31#include <linux/uio.h>
32#include <linux/security.h>
33#include <linux/gfp.h>
34
35/*
36 * Attempt to steal a page from a pipe buffer. This should perhaps go into
37 * a vm helper function, it's already simplified quite a bit by the
38 * addition of remove_mapping(). If success is returned, the caller may
39 * attempt to reuse this page for another destination.
40 */
41static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
42                     struct pipe_buffer *buf)
43{
44    struct page *page = buf->page;
45    struct address_space *mapping;
46
47    lock_page(page);
48
49    mapping = page_mapping(page);
50    if (mapping) {
51        WARN_ON(!PageUptodate(page));
52
53        /*
54         * At least for ext2 with nobh option, we need to wait on
55         * writeback completing on this page, since we'll remove it
56         * from the pagecache. Otherwise truncate wont wait on the
57         * page, allowing the disk blocks to be reused by someone else
58         * before we actually wrote our data to them. fs corruption
59         * ensues.
60         */
61        wait_on_page_writeback(page);
62
63        if (page_has_private(page) &&
64            !try_to_release_page(page, GFP_KERNEL))
65            goto out_unlock;
66
67        /*
68         * If we succeeded in removing the mapping, set LRU flag
69         * and return good.
70         */
71        if (remove_mapping(mapping, page)) {
72            buf->flags |= PIPE_BUF_FLAG_LRU;
73            return 0;
74        }
75    }
76
77    /*
78     * Raced with truncate or failed to remove page from current
79     * address space, unlock and return failure.
80     */
81out_unlock:
82    unlock_page(page);
83    return 1;
84}
85
86static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
87                    struct pipe_buffer *buf)
88{
89    page_cache_release(buf->page);
90    buf->flags &= ~PIPE_BUF_FLAG_LRU;
91}
92
93/*
94 * Check whether the contents of buf is OK to access. Since the content
95 * is a page cache page, IO may be in flight.
96 */
97static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
98                       struct pipe_buffer *buf)
99{
100    struct page *page = buf->page;
101    int err;
102
103    if (!PageUptodate(page)) {
104        lock_page(page);
105
106        /*
107         * Page got truncated/unhashed. This will cause a 0-byte
108         * splice, if this is the first page.
109         */
110        if (!page->mapping) {
111            err = -ENODATA;
112            goto error;
113        }
114
115        /*
116         * Uh oh, read-error from disk.
117         */
118        if (!PageUptodate(page)) {
119            err = -EIO;
120            goto error;
121        }
122
123        /*
124         * Page is ok afterall, we are done.
125         */
126        unlock_page(page);
127    }
128
129    return 0;
130error:
131    unlock_page(page);
132    return err;
133}
134
135static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
136    .can_merge = 0,
137    .map = generic_pipe_buf_map,
138    .unmap = generic_pipe_buf_unmap,
139    .confirm = page_cache_pipe_buf_confirm,
140    .release = page_cache_pipe_buf_release,
141    .steal = page_cache_pipe_buf_steal,
142    .get = generic_pipe_buf_get,
143};
144
145static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
146                    struct pipe_buffer *buf)
147{
148    if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
149        return 1;
150
151    buf->flags |= PIPE_BUF_FLAG_LRU;
152    return generic_pipe_buf_steal(pipe, buf);
153}
154
155static const struct pipe_buf_operations user_page_pipe_buf_ops = {
156    .can_merge = 0,
157    .map = generic_pipe_buf_map,
158    .unmap = generic_pipe_buf_unmap,
159    .confirm = generic_pipe_buf_confirm,
160    .release = page_cache_pipe_buf_release,
161    .steal = user_page_pipe_buf_steal,
162    .get = generic_pipe_buf_get,
163};
164
165static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
166{
167    smp_mb();
168    if (waitqueue_active(&pipe->wait))
169        wake_up_interruptible(&pipe->wait);
170    kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
171}
172
173/**
174 * splice_to_pipe - fill passed data into a pipe
175 * @pipe: pipe to fill
176 * @spd: data to fill
177 *
178 * Description:
179 * @spd contains a map of pages and len/offset tuples, along with
180 * the struct pipe_buf_operations associated with these pages. This
181 * function will link that data to the pipe.
182 *
183 */
184ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
185               struct splice_pipe_desc *spd)
186{
187    unsigned int spd_pages = spd->nr_pages;
188    int ret, do_wakeup, page_nr;
189
190    ret = 0;
191    do_wakeup = 0;
192    page_nr = 0;
193
194    pipe_lock(pipe);
195
196    for (;;) {
197        if (!pipe->readers) {
198            send_sig(SIGPIPE, current, 0);
199            if (!ret)
200                ret = -EPIPE;
201            break;
202        }
203
204        if (pipe->nrbufs < pipe->buffers) {
205            int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
206            struct pipe_buffer *buf = pipe->bufs + newbuf;
207
208            buf->page = spd->pages[page_nr];
209            buf->offset = spd->partial[page_nr].offset;
210            buf->len = spd->partial[page_nr].len;
211            buf->private = spd->partial[page_nr].private;
212            buf->ops = spd->ops;
213            if (spd->flags & SPLICE_F_GIFT)
214                buf->flags |= PIPE_BUF_FLAG_GIFT;
215
216            pipe->nrbufs++;
217            page_nr++;
218            ret += buf->len;
219
220            if (pipe->inode)
221                do_wakeup = 1;
222
223            if (!--spd->nr_pages)
224                break;
225            if (pipe->nrbufs < pipe->buffers)
226                continue;
227
228            break;
229        }
230
231        if (spd->flags & SPLICE_F_NONBLOCK) {
232            if (!ret)
233                ret = -EAGAIN;
234            break;
235        }
236
237        if (signal_pending(current)) {
238            if (!ret)
239                ret = -ERESTARTSYS;
240            break;
241        }
242
243        if (do_wakeup) {
244            smp_mb();
245            if (waitqueue_active(&pipe->wait))
246                wake_up_interruptible_sync(&pipe->wait);
247            kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
248            do_wakeup = 0;
249        }
250
251        pipe->waiting_writers++;
252        pipe_wait(pipe);
253        pipe->waiting_writers--;
254    }
255
256    pipe_unlock(pipe);
257
258    if (do_wakeup)
259        wakeup_pipe_readers(pipe);
260
261    while (page_nr < spd_pages)
262        spd->spd_release(spd, page_nr++);
263
264    return ret;
265}
266
267static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
268{
269    page_cache_release(spd->pages[i]);
270}
271
272/*
273 * Check if we need to grow the arrays holding pages and partial page
274 * descriptions.
275 */
276int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
277{
278    if (pipe->buffers <= PIPE_DEF_BUFFERS)
279        return 0;
280
281    spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
282    spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
283
284    if (spd->pages && spd->partial)
285        return 0;
286
287    kfree(spd->pages);
288    kfree(spd->partial);
289    return -ENOMEM;
290}
291
292void splice_shrink_spd(struct pipe_inode_info *pipe,
293               struct splice_pipe_desc *spd)
294{
295    if (pipe->buffers <= PIPE_DEF_BUFFERS)
296        return;
297
298    kfree(spd->pages);
299    kfree(spd->partial);
300}
301
/*
 * __generic_file_splice_read - gather page cache pages of @in and queue
 * them on @pipe.
 *
 * @in:    file whose mapping (in->f_mapping) supplies the pages
 * @ppos:  file position to start from; it is read but not advanced here
 * @pipe:  destination pipe; at most pipe->buffers pages are gathered
 * @len:   maximum number of bytes to queue
 * @flags: splice flags, handed to splice_to_pipe() through spd.flags
 *
 * Returns -ENOMEM if splice_grow_spd() fails. Otherwise, if at least one
 * page was prepared, the result of splice_to_pipe() is returned; if none
 * was, 'error' is returned (0, or the error recorded while preparing
 * the pages).
 */
302static int
303__generic_file_splice_read(struct file *in, loff_t *ppos,
304               struct pipe_inode_info *pipe, size_t len,
305               unsigned int flags)
306{
307    struct address_space *mapping = in->f_mapping;
308    unsigned int loff, nr_pages, req_pages;
309    struct page *pages[PIPE_DEF_BUFFERS];
310    struct partial_page partial[PIPE_DEF_BUFFERS];
311    struct page *page;
312    pgoff_t index, end_index;
313    loff_t isize;
314    int error, page_nr;
315    struct splice_pipe_desc spd = {
316        .pages = pages,
317        .partial = partial,
318        .flags = flags,
319        .ops = &page_cache_pipe_buf_ops,
320        .spd_release = spd_release_page,
321    };
322
323    if (splice_grow_spd(pipe, &spd))
324        return -ENOMEM;
325
    /*
     * Split *ppos into a page index and an offset inside that page.
     * req_pages is the number of pages the request spans, nr_pages is
     * that number capped at the number of pipe buffers.
     */
326    index = *ppos >> PAGE_CACHE_SHIFT;
327    loff = *ppos & ~PAGE_CACHE_MASK;
328    req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
329    nr_pages = min(req_pages, pipe->buffers);
330
331    /*
332     * Lookup the (hopefully) full range of pages we need.
333     */
334    spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
335    index += spd.nr_pages;
336
337    /*
338     * If find_get_pages_contig() returned fewer pages than we needed,
339     * readahead/allocate the rest and fill in the holes.
340     */
341    if (spd.nr_pages < nr_pages)
342        page_cache_sync_readahead(mapping, &in->f_ra, in,
343                index, req_pages - spd.nr_pages);
344
345    error = 0;
346    while (spd.nr_pages < nr_pages) {
347        /*
348         * Page could be there, find_get_pages_contig() breaks on
349         * the first hole.
350         */
351        page = find_get_page(mapping, index);
352        if (!page) {
353            /*
354             * page didn't exist, allocate one.
355             */
356            page = page_cache_alloc_cold(mapping);
357            if (!page)
358                break;
359
360            error = add_to_page_cache_lru(page, mapping, index,
361                        GFP_KERNEL);
362            if (unlikely(error)) {
363                page_cache_release(page);
                /* -EEXIST: retry the lookup for the same index. */
364                if (error == -EEXIST)
365                    continue;
366                break;
367            }
368            /*
369             * add_to_page_cache() locks the page, unlock it
370             * to avoid convoluting the logic below even more.
371             */
372            unlock_page(page);
373        }
374
375        spd.pages[spd.nr_pages++] = page;
376        index++;
377    }
378
379    /*
380     * Now loop over the map and see if we need to start IO on any
381     * pages, fill in the partial map, etc.
382     */
    /*
     * From here on nr_pages is the number of pages actually collected,
     * while spd.nr_pages restarts at zero and counts only the pages
     * that get a partial[] entry below.
     */
383    index = *ppos >> PAGE_CACHE_SHIFT;
384    nr_pages = spd.nr_pages;
385    spd.nr_pages = 0;
386    for (page_nr = 0; page_nr < nr_pages; page_nr++) {
387        unsigned int this_len;
388
389        if (!len)
390            break;
391
392        /*
393         * this_len is the max we'll use from this page
394         */
395        this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
396        page = spd.pages[page_nr];
397
398        if (PageReadahead(page))
399            page_cache_async_readahead(mapping, &in->f_ra, in,
400                    page, index, req_pages - page_nr);
401
402        /*
403         * If the page isn't uptodate, we may need to start io on it
404         */
405        if (!PageUptodate(page)) {
406            lock_page(page);
407
408            /*
409             * Page was truncated, or invalidated by the
410             * filesystem. Redo the find/create, but this time the
411             * page is kept locked, so there's no chance of another
412             * race with truncate/invalidate.
413             */
414            if (!page->mapping) {
415                unlock_page(page);
416                page = find_or_create_page(mapping, index,
417                        mapping_gfp_mask(mapping));
418
419                if (!page) {
420                    error = -ENOMEM;
421                    break;
422                }
                /* Swap the stale page in the map for the new one. */
423                page_cache_release(spd.pages[page_nr]);
424                spd.pages[page_nr] = page;
425            }
426            /*
427             * page was already under io and is now done, great
428             */
429            if (PageUptodate(page)) {
430                unlock_page(page);
431                goto fill_it;
432            }
433
434            /*
435             * need to read in the page
436             */
437            error = mapping->a_ops->readpage(in, page);
438            if (unlikely(error)) {
439                /*
440                 * We really should re-lookup the page here,
441                 * but it complicates things a lot. Instead
442                 * lets just do what we already stored, and
443                 * we'll get it the next time we are called.
444                 */
445                if (error == AOP_TRUNCATED_PAGE)
446                    error = 0;
447
448                break;
449            }
450        }
451fill_it:
452        /*
453         * i_size must be checked after PageUptodate.
454         */
455        isize = i_size_read(mapping->host);
456        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
457        if (unlikely(!isize || index > end_index))
458            break;
459
460        /*
461         * if this is the last page, see if we need to shrink
462         * the length and stop
463         */
464        if (end_index == index) {
465            unsigned int plen;
466
467            /*
468             * max good bytes in this page
469             */
470            plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
471            if (plen <= loff)
472                break;
473
474            /*
475             * force quit after adding this page
476             */
477            this_len = min(this_len, plen - loff);
478            len = this_len;
479        }
480
        /* Only the first page can start at a non-zero offset. */
481        spd.partial[page_nr].offset = loff;
482        spd.partial[page_nr].len = this_len;
483        len -= this_len;
484        loff = 0;
485        spd.nr_pages++;
486        index++;
487    }
488
489    /*
490     * Release any pages at the end, if we quit early. 'page_nr' is how far
491     * we got, 'nr_pages' is how many pages are in the map.
492     */
493    while (page_nr < nr_pages)
494        page_cache_release(spd.pages[page_nr++]);
495    in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
496
    /* 'error' is only reported when no page could be queued at all. */
497    if (spd.nr_pages)
498        error = splice_to_pipe(pipe, &spd);
499
500    splice_shrink_spd(pipe, &spd);
501    return error;
502}
503
504/**
505 * generic_file_splice_read - splice data from file to a pipe
506 * @in: file to splice from
507 * @ppos: position in @in
508 * @pipe: pipe to splice to
509 * @len: number of bytes to splice
510 * @flags: splice modifier flags
511 *
512 * Description:
513 * Will read pages from given file and fill them into a pipe. Can be
514 * used as long as the address_space operations for the source implements
515 * a readpage() hook.
516 *
517 */
518ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
519                 struct pipe_inode_info *pipe, size_t len,
520                 unsigned int flags)
521{
522    loff_t isize, left;
523    int ret;
524
525    isize = i_size_read(in->f_mapping->host);
526    if (unlikely(*ppos >= isize))
527        return 0;
528
529    left = isize - *ppos;
530    if (unlikely(left < len))
531        len = left;
532
533    ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
534    if (ret > 0) {
535        *ppos += ret;
536        file_accessed(in);
537    }
538
539    return ret;
540}
541EXPORT_SYMBOL(generic_file_splice_read);
542
543static const struct pipe_buf_operations default_pipe_buf_ops = {
544    .can_merge = 0,
545    .map = generic_pipe_buf_map,
546    .unmap = generic_pipe_buf_unmap,
547    .confirm = generic_pipe_buf_confirm,
548    .release = generic_pipe_buf_release,
549    .steal = generic_pipe_buf_steal,
550    .get = generic_pipe_buf_get,
551};
552
553static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
554                unsigned long vlen, loff_t offset)
555{
556    mm_segment_t old_fs;
557    loff_t pos = offset;
558    ssize_t res;
559
560    old_fs = get_fs();
561    set_fs(get_ds());
562    /* The cast to a user pointer is valid due to the set_fs() */
563    res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
564    set_fs(old_fs);
565
566    return res;
567}
568
569static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
570                loff_t pos)
571{
572    mm_segment_t old_fs;
573    ssize_t res;
574
575    old_fs = get_fs();
576    set_fs(get_ds());
577    /* The cast to a user pointer is valid due to the set_fs() */
578    res = vfs_write(file, (const char __user *)buf, count, &pos);
579    set_fs(old_fs);
580
581    return res;
582}
583
584ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
585                 struct pipe_inode_info *pipe, size_t len,
586                 unsigned int flags)
587{
588    unsigned int nr_pages;
589    unsigned int nr_freed;
590    size_t offset;
591    struct page *pages[PIPE_DEF_BUFFERS];
592    struct partial_page partial[PIPE_DEF_BUFFERS];
593    struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
594    ssize_t res;
595    size_t this_len;
596    int error;
597    int i;
598    struct splice_pipe_desc spd = {
599        .pages = pages,
600        .partial = partial,
601        .flags = flags,
602        .ops = &default_pipe_buf_ops,
603        .spd_release = spd_release_page,
604    };
605
606    if (splice_grow_spd(pipe, &spd))
607        return -ENOMEM;
608
609    res = -ENOMEM;
610    vec = __vec;
611    if (pipe->buffers > PIPE_DEF_BUFFERS) {
612        vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
613        if (!vec)
614            goto shrink_ret;
615    }
616
617    offset = *ppos & ~PAGE_CACHE_MASK;
618    nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
619
620    for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
621        struct page *page;
622
623        page = alloc_page(GFP_USER);
624        error = -ENOMEM;
625        if (!page)
626            goto err;
627
628        this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
629        vec[i].iov_base = (void __user *) page_address(page);
630        vec[i].iov_len = this_len;
631        spd.pages[i] = page;
632        spd.nr_pages++;
633        len -= this_len;
634        offset = 0;
635    }
636
637    res = kernel_readv(in, vec, spd.nr_pages, *ppos);
638    if (res < 0) {
639        error = res;
640        goto err;
641    }
642
643    error = 0;
644    if (!res)
645        goto err;
646
647    nr_freed = 0;
648    for (i = 0; i < spd.nr_pages; i++) {
649        this_len = min_t(size_t, vec[i].iov_len, res);
650        spd.partial[i].offset = 0;
651        spd.partial[i].len = this_len;
652        if (!this_len) {
653            __free_page(spd.pages[i]);
654            spd.pages[i] = NULL;
655            nr_freed++;
656        }
657        res -= this_len;
658    }
659    spd.nr_pages -= nr_freed;
660
661    res = splice_to_pipe(pipe, &spd);
662    if (res > 0)
663        *ppos += res;
664
665shrink_ret:
666    if (vec != __vec)
667        kfree(vec);
668    splice_shrink_spd(pipe, &spd);
669    return res;
670
671err:
672    for (i = 0; i < spd.nr_pages; i++)
673        __free_page(spd.pages[i]);
674
675    res = error;
676    goto shrink_ret;
677}
678EXPORT_SYMBOL(default_file_splice_read);
679
680/*
681 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
682 * using sendpage(). Return the number of bytes sent.
683 */
684static int pipe_to_sendpage(struct pipe_inode_info *pipe,
685                struct pipe_buffer *buf, struct splice_desc *sd)
686{
687    struct file *file = sd->u.file;
688    loff_t pos = sd->pos;
689    int more;
690
691    if (!likely(file->f_op && file->f_op->sendpage))
692        return -EINVAL;
693
694    more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
695    return file->f_op->sendpage(file, buf->page, buf->offset,
696                    sd->len, &pos, more);
697}
698
699/*
700 * This is a little more tricky than the file -> pipe splicing. There are
701 * basically three cases:
702 *
703 * - Destination page already exists in the address space and there
704 * are users of it. For that case we have no other option that
705 * copying the data. Tough luck.
706 * - Destination page already exists in the address space, but there
707 * are no users of it. Make sure it's uptodate, then drop it. Fall
708 * through to last case.
709 * - Destination page does not exist, we can add the pipe page to
710 * the page cache and avoid the copy.
711 *
712 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
713 * sd->flags), we attempt to migrate pages from the pipe to the output
714 * file address space page cache. This is possible if no one else has
715 * the pipe page referenced outside of the pipe and page cache. If
716 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
717 * a new page in the output file page cache and fill/dirty that.
718 */
719int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
720         struct splice_desc *sd)
721{
722    struct file *file = sd->u.file;
723    struct address_space *mapping = file->f_mapping;
724    unsigned int offset, this_len;
725    struct page *page;
726    void *fsdata;
727    int ret;
728
729    offset = sd->pos & ~PAGE_CACHE_MASK;
730
731    this_len = sd->len;
732    if (this_len + offset > PAGE_CACHE_SIZE)
733        this_len = PAGE_CACHE_SIZE - offset;
734
735    ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
736                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
737    if (unlikely(ret))
738        goto out;
739
740    if (buf->page != page) {
741        /*
742         * Careful, ->map() uses KM_USER0!
743         */
744        char *src = buf->ops->map(pipe, buf, 1);
745        char *dst = kmap_atomic(page, KM_USER1);
746
747        memcpy(dst + offset, src + buf->offset, this_len);
748        flush_dcache_page(page);
749        kunmap_atomic(dst, KM_USER1);
750        buf->ops->unmap(pipe, buf, src);
751    }
752    ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
753                page, fsdata);
754out:
755    return ret;
756}
757EXPORT_SYMBOL(pipe_to_file);
758
759static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
760{
761    smp_mb();
762    if (waitqueue_active(&pipe->wait))
763        wake_up_interruptible(&pipe->wait);
764    kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
765}
766
767/**
768 * splice_from_pipe_feed - feed available data from a pipe to a file
769 * @pipe: pipe to splice from
770 * @sd: information to @actor
771 * @actor: handler that splices the data
772 *
773 * Description:
774 * This function loops over the pipe and calls @actor to do the
775 * actual moving of a single struct pipe_buffer to the desired
776 * destination. It returns when there's no more buffers left in
777 * the pipe or if the requested number of bytes (@sd->total_len)
778 * have been copied. It returns a positive number (one) if the
779 * pipe needs to be filled with more data, zero if the required
780 * number of bytes have been copied and -errno on error.
781 *
782 * This, together with splice_from_pipe_{begin,end,next}, may be
783 * used to implement the functionality of __splice_from_pipe() when
784 * locking is required around copying the pipe buffers to the
785 * destination.
786 */
787int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
788              splice_actor *actor)
789{
790    int ret;
791
792    while (pipe->nrbufs) {
793        struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
794        const struct pipe_buf_operations *ops = buf->ops;
795
796        sd->len = buf->len;
797        if (sd->len > sd->total_len)
798            sd->len = sd->total_len;
799
800        ret = buf->ops->confirm(pipe, buf);
801        if (unlikely(ret)) {
802            if (ret == -ENODATA)
803                ret = 0;
804            return ret;
805        }
806
807        ret = actor(pipe, buf, sd);
808        if (ret <= 0)
809            return ret;
810
811        buf->offset += ret;
812        buf->len -= ret;
813
814        sd->num_spliced += ret;
815        sd->len -= ret;
816        sd->pos += ret;
817        sd->total_len -= ret;
818
819        if (!buf->len) {
820            buf->ops = NULL;
821            ops->release(pipe, buf);
822            pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
823            pipe->nrbufs--;
824            if (pipe->inode)
825                sd->need_wakeup = true;
826        }
827
828        if (!sd->total_len)
829            return 0;
830    }
831
832    return 1;
833}
834EXPORT_SYMBOL(splice_from_pipe_feed);
835
836/**
837 * splice_from_pipe_next - wait for some data to splice from
838 * @pipe: pipe to splice from
839 * @sd: information about the splice operation
840 *
841 * Description:
842 * This function will wait for some data and return a positive
843 * value (one) if pipe buffers are available. It will return zero
844 * or -errno if no more data needs to be spliced.
845 */
846int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
847{
848    while (!pipe->nrbufs) {
849        if (!pipe->writers)
850            return 0;
851
852        if (!pipe->waiting_writers && sd->num_spliced)
853            return 0;
854
855        if (sd->flags & SPLICE_F_NONBLOCK)
856            return -EAGAIN;
857
858        if (signal_pending(current))
859            return -ERESTARTSYS;
860
861        if (sd->need_wakeup) {
862            wakeup_pipe_writers(pipe);
863            sd->need_wakeup = false;
864        }
865
866        pipe_wait(pipe);
867    }
868
869    return 1;
870}
871EXPORT_SYMBOL(splice_from_pipe_next);
872
873/**
874 * splice_from_pipe_begin - start splicing from pipe
875 * @sd: information about the splice operation
876 *
877 * Description:
878 * This function should be called before a loop containing
879 * splice_from_pipe_next() and splice_from_pipe_feed() to
880 * initialize the necessary fields of @sd.
881 */
882void splice_from_pipe_begin(struct splice_desc *sd)
883{
884    sd->num_spliced = 0;
885    sd->need_wakeup = false;
886}
887EXPORT_SYMBOL(splice_from_pipe_begin);
888
889/**
890 * splice_from_pipe_end - finish splicing from pipe
891 * @pipe: pipe to splice from
892 * @sd: information about the splice operation
893 *
894 * Description:
895 * This function will wake up pipe writers if necessary. It should
896 * be called after a loop containing splice_from_pipe_next() and
897 * splice_from_pipe_feed().
898 */
899void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
900{
901    if (sd->need_wakeup)
902        wakeup_pipe_writers(pipe);
903}
904EXPORT_SYMBOL(splice_from_pipe_end);
905
906/**
907 * __splice_from_pipe - splice data from a pipe to given actor
908 * @pipe: pipe to splice from
909 * @sd: information to @actor
910 * @actor: handler that splices the data
911 *
912 * Description:
913 * This function does little more than loop over the pipe and call
914 * @actor to do the actual moving of a single struct pipe_buffer to
915 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
916 * pipe_to_user.
917 *
918 */
919ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
920               splice_actor *actor)
921{
922    int ret;
923
924    splice_from_pipe_begin(sd);
925    do {
926        ret = splice_from_pipe_next(pipe, sd);
927        if (ret > 0)
928            ret = splice_from_pipe_feed(pipe, sd, actor);
929    } while (ret > 0);
930    splice_from_pipe_end(pipe, sd);
931
932    return sd->num_spliced ? sd->num_spliced : ret;
933}
934EXPORT_SYMBOL(__splice_from_pipe);
935
936/**
937 * splice_from_pipe - splice data from a pipe to a file
938 * @pipe: pipe to splice from
939 * @out: file to splice to
940 * @ppos: position in @out
941 * @len: how many bytes to splice
942 * @flags: splice modifier flags
943 * @actor: handler that splices the data
944 *
945 * Description:
946 * See __splice_from_pipe. This function locks the pipe inode,
947 * otherwise it's identical to __splice_from_pipe().
948 *
949 */
950ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
951             loff_t *ppos, size_t len, unsigned int flags,
952             splice_actor *actor)
953{
954    ssize_t ret;
955    struct splice_desc sd = {
956        .total_len = len,
957        .flags = flags,
958        .pos = *ppos,
959        .u.file = out,
960    };
961
962    pipe_lock(pipe);
963    ret = __splice_from_pipe(pipe, &sd, actor);
964    pipe_unlock(pipe);
965
966    return ret;
967}
968
969/**
970 * generic_file_splice_write - splice data from a pipe to a file
971 * @pipe: pipe info
972 * @out: file to write to
973 * @ppos: position in @out
974 * @len: number of bytes to splice
975 * @flags: splice modifier flags
976 *
977 * Description:
978 * Will either move or copy pages (determined by @flags options) from
979 * the given pipe inode to the given file.
980 *
981 */
982ssize_t
983generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
984              loff_t *ppos, size_t len, unsigned int flags)
985{
986    struct address_space *mapping = out->f_mapping;
987    struct inode *inode = mapping->host;
988    struct splice_desc sd = {
989        .total_len = len,
990        .flags = flags,
991        .pos = *ppos,
992        .u.file = out,
993    };
994    ssize_t ret;
995
996    pipe_lock(pipe);
997
998    splice_from_pipe_begin(&sd);
999    do {
1000        ret = splice_from_pipe_next(pipe, &sd);
1001        if (ret <= 0)
1002            break;
1003
1004        mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1005        ret = file_remove_suid(out);
1006        if (!ret) {
1007            file_update_time(out);
1008            ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
1009        }
1010        mutex_unlock(&inode->i_mutex);
1011    } while (ret > 0);
1012    splice_from_pipe_end(pipe, &sd);
1013
1014    pipe_unlock(pipe);
1015
1016    if (sd.num_spliced)
1017        ret = sd.num_spliced;
1018
1019    if (ret > 0) {
1020        unsigned long nr_pages;
1021        int err;
1022
1023        nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1024
1025        err = generic_write_sync(out, *ppos, ret);
1026        if (err)
1027            ret = err;
1028        else
1029            *ppos += ret;
1030        balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
1031    }
1032
1033    return ret;
1034}
1035
1036EXPORT_SYMBOL(generic_file_splice_write);
1037
1038static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1039              struct splice_desc *sd)
1040{
1041    int ret;
1042    void *data;
1043
1044    data = buf->ops->map(pipe, buf, 0);
1045    ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1046    buf->ops->unmap(pipe, buf, data);
1047
1048    return ret;
1049}
1050
1051static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
1052                     struct file *out, loff_t *ppos,
1053                     size_t len, unsigned int flags)
1054{
1055    ssize_t ret;
1056
1057    ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
1058    if (ret > 0)
1059        *ppos += ret;
1060
1061    return ret;
1062}
1063
1064/**
1065 * generic_splice_sendpage - splice data from a pipe to a socket
1066 * @pipe: pipe to splice from
1067 * @out: socket to write to
1068 * @ppos: position in @out
1069 * @len: number of bytes to splice
1070 * @flags: splice modifier flags
1071 *
1072 * Description:
1073 * Will send @len bytes from the pipe to a network socket. No data copying
1074 * is involved.
1075 *
1076 */
1077ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
1078                loff_t *ppos, size_t len, unsigned int flags)
1079{
1080    return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
1081}
1082
1083EXPORT_SYMBOL(generic_splice_sendpage);
1084
1085/*
1086 * Attempt to initiate a splice from pipe to file.
1087 */
1088static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1089               loff_t *ppos, size_t len, unsigned int flags)
1090{
1091    ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1092                loff_t *, size_t, unsigned int);
1093    int ret;
1094
1095    if (unlikely(!(out->f_mode & FMODE_WRITE)))
1096        return -EBADF;
1097
1098    if (unlikely(out->f_flags & O_APPEND))
1099        return -EINVAL;
1100
1101    ret = rw_verify_area(WRITE, out, ppos, len);
1102    if (unlikely(ret < 0))
1103        return ret;
1104
1105    if (out->f_op && out->f_op->splice_write)
1106        splice_write = out->f_op->splice_write;
1107    else
1108        splice_write = default_file_splice_write;
1109
1110    return splice_write(pipe, out, ppos, len, flags);
1111}
1112
1113/*
1114 * Attempt to initiate a splice from a file to a pipe.
1115 */
1116static long do_splice_to(struct file *in, loff_t *ppos,
1117             struct pipe_inode_info *pipe, size_t len,
1118             unsigned int flags)
1119{
1120    ssize_t (*splice_read)(struct file *, loff_t *,
1121                   struct pipe_inode_info *, size_t, unsigned int);
1122    int ret;
1123
1124    if (unlikely(!(in->f_mode & FMODE_READ)))
1125        return -EBADF;
1126
1127    ret = rw_verify_area(READ, in, ppos, len);
1128    if (unlikely(ret < 0))
1129        return ret;
1130
1131    if (in->f_op && in->f_op->splice_read)
1132        splice_read = in->f_op->splice_read;
1133    else
1134        splice_read = default_file_splice_read;
1135
1136    return splice_read(in, ppos, pipe, len, flags);
1137}
1138
1139/**
1140 * splice_direct_to_actor - splices data directly between two non-pipes
1141 * @in: file to splice from
1142 * @sd: actor information on where to splice to
1143 * @actor: handles the data splicing
1144 *
1145 * Description:
1146 * This is a special case helper to splice directly between two
1147 * points, without requiring an explicit pipe. Internally an allocated
1148 * pipe is cached in the process, and reused during the lifetime of
1149 * that process.
1150 *
1151 */
1152ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1153                   splice_direct_actor *actor)
1154{
1155    struct pipe_inode_info *pipe;
1156    long ret, bytes;
1157    umode_t i_mode;
1158    size_t len;
1159    int i, flags;
1160
1161    /*
1162     * We require the input being a regular file, as we don't want to
1163     * randomly drop data for eg socket -> socket splicing. Use the
1164     * piped splicing for that!
1165     */
1166    i_mode = in->f_path.dentry->d_inode->i_mode;
1167    if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1168        return -EINVAL;
1169
1170    /*
1171     * neither in nor out is a pipe, setup an internal pipe attached to
1172     * 'out' and transfer the wanted data from 'in' to 'out' through that
1173     */
1174    pipe = current->splice_pipe;
1175    if (unlikely(!pipe)) {
1176        pipe = alloc_pipe_info(NULL);
1177        if (!pipe)
1178            return -ENOMEM;
1179
1180        /*
1181         * We don't have an immediate reader, but we'll read the stuff
1182         * out of the pipe right after the splice_to_pipe(). So set
1183         * PIPE_READERS appropriately.
1184         */
1185        pipe->readers = 1;
1186
1187        current->splice_pipe = pipe;
1188    }
1189
1190    /*
1191     * Do the splice.
1192     */
1193    ret = 0;
1194    bytes = 0;
1195    len = sd->total_len;
1196    flags = sd->flags;
1197
1198    /*
1199     * Don't block on output, we have to drain the direct pipe.
1200     */
1201    sd->flags &= ~SPLICE_F_NONBLOCK;
1202
1203    while (len) {
1204        size_t read_len;
1205        loff_t pos = sd->pos, prev_pos = pos;
1206
1207        ret = do_splice_to(in, &pos, pipe, len, flags);
1208        if (unlikely(ret <= 0))
1209            goto out_release;
1210
1211        read_len = ret;
1212        sd->total_len = read_len;
1213
1214        /*
1215         * NOTE: nonblocking mode only applies to the input. We
1216         * must not do the output in nonblocking mode as then we
1217         * could get stuck data in the internal pipe:
1218         */
1219        ret = actor(pipe, sd);
1220        if (unlikely(ret <= 0)) {
1221            sd->pos = prev_pos;
1222            goto out_release;
1223        }
1224
1225        bytes += ret;
1226        len -= ret;
1227        sd->pos = pos;
1228
1229        if (ret < read_len) {
1230            sd->pos = prev_pos + ret;
1231            goto out_release;
1232        }
1233    }
1234
1235done:
1236    pipe->nrbufs = pipe->curbuf = 0;
1237    file_accessed(in);
1238    return bytes;
1239
1240out_release:
1241    /*
1242     * If we did an incomplete transfer we must release
1243     * the pipe buffers in question:
1244     */
1245    for (i = 0; i < pipe->buffers; i++) {
1246        struct pipe_buffer *buf = pipe->bufs + i;
1247
1248        if (buf->ops) {
1249            buf->ops->release(pipe, buf);
1250            buf->ops = NULL;
1251        }
1252    }
1253
1254    if (!bytes)
1255        bytes = ret;
1256
1257    goto done;
1258}
1259EXPORT_SYMBOL(splice_direct_to_actor);
1260
1261static int direct_splice_actor(struct pipe_inode_info *pipe,
1262                   struct splice_desc *sd)
1263{
1264    struct file *file = sd->u.file;
1265
1266    return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
1267                  sd->flags);
1268}
1269
1270/**
1271 * do_splice_direct - splices data directly between two files
1272 * @in: file to splice from
1273 * @ppos: input file offset
1274 * @out: file to splice to
1275 * @len: number of bytes to splice
1276 * @flags: splice modifier flags
1277 *
1278 * Description:
1279 * For use by do_sendfile(). splice can easily emulate sendfile, but
1280 * doing it in the application would incur an extra system call
1281 * (splice in + splice out, as compared to just sendfile()). So this helper
1282 * can splice directly through a process-private pipe.
1283 *
1284 */
1285long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1286              size_t len, unsigned int flags)
1287{
1288    struct splice_desc sd = {
1289        .len = len,
1290        .total_len = len,
1291        .flags = flags,
1292        .pos = *ppos,
1293        .u.file = out,
1294    };
1295    long ret;
1296
1297    ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1298    if (ret > 0)
1299        *ppos = sd.pos;
1300
1301    return ret;
1302}
1303
1304static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1305                   struct pipe_inode_info *opipe,
1306                   size_t len, unsigned int flags);
1307
1308/*
1309 * Determine where to splice to/from.
1310 */
1311static long do_splice(struct file *in, loff_t __user *off_in,
1312              struct file *out, loff_t __user *off_out,
1313              size_t len, unsigned int flags)
1314{
1315    struct pipe_inode_info *ipipe;
1316    struct pipe_inode_info *opipe;
1317    loff_t offset, *off;
1318    long ret;
1319
1320    ipipe = get_pipe_info(in);
1321    opipe = get_pipe_info(out);
1322
1323    if (ipipe && opipe) {
1324        if (off_in || off_out)
1325            return -ESPIPE;
1326
1327        if (!(in->f_mode & FMODE_READ))
1328            return -EBADF;
1329
1330        if (!(out->f_mode & FMODE_WRITE))
1331            return -EBADF;
1332
1333        /* Splicing to self would be fun, but... */
1334        if (ipipe == opipe)
1335            return -EINVAL;
1336
1337        return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1338    }
1339
1340    if (ipipe) {
1341        if (off_in)
1342            return -ESPIPE;
1343        if (off_out) {
1344            if (!(out->f_mode & FMODE_PWRITE))
1345                return -EINVAL;
1346            if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1347                return -EFAULT;
1348            off = &offset;
1349        } else
1350            off = &out->f_pos;
1351
1352        ret = do_splice_from(ipipe, out, off, len, flags);
1353
1354        if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1355            ret = -EFAULT;
1356
1357        return ret;
1358    }
1359
1360    if (opipe) {
1361        if (off_out)
1362            return -ESPIPE;
1363        if (off_in) {
1364            if (!(in->f_mode & FMODE_PREAD))
1365                return -EINVAL;
1366            if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1367                return -EFAULT;
1368            off = &offset;
1369        } else
1370            off = &in->f_pos;
1371
1372        ret = do_splice_to(in, off, opipe, len, flags);
1373
1374        if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1375            ret = -EFAULT;
1376
1377        return ret;
1378    }
1379
1380    return -EINVAL;
1381}
1382
1383/*
1384 * Map an iov into an array of pages and offset/length tupples. With the
1385 * partial_page structure, we can map several non-contiguous ranges into
1386 * our ones pages[] map instead of splitting that operation into pieces.
1387 * Could easily be exported as a generic helper for other users, in which
1388 * case one would probably want to add a 'max_nr_pages' parameter as well.
1389 */
1390static int get_iovec_page_array(const struct iovec __user *iov,
1391                unsigned int nr_vecs, struct page **pages,
1392                struct partial_page *partial, int aligned,
1393                unsigned int pipe_buffers)
1394{
1395    int buffers = 0, error = 0;
1396
1397    while (nr_vecs) {
1398        unsigned long off, npages;
1399        struct iovec entry;
1400        void __user *base;
1401        size_t len;
1402        int i;
1403
1404        error = -EFAULT;
1405        if (copy_from_user(&entry, iov, sizeof(entry)))
1406            break;
1407
1408        base = entry.iov_base;
1409        len = entry.iov_len;
1410
1411        /*
1412         * Sanity check this iovec. 0 read succeeds.
1413         */
1414        error = 0;
1415        if (unlikely(!len))
1416            break;
1417        error = -EFAULT;
1418        if (!access_ok(VERIFY_READ, base, len))
1419            break;
1420
1421        /*
1422         * Get this base offset and number of pages, then map
1423         * in the user pages.
1424         */
1425        off = (unsigned long) base & ~PAGE_MASK;
1426
1427        /*
1428         * If asked for alignment, the offset must be zero and the
1429         * length a multiple of the PAGE_SIZE.
1430         */
1431        error = -EINVAL;
1432        if (aligned && (off || len & ~PAGE_MASK))
1433            break;
1434
1435        npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1436        if (npages > pipe_buffers - buffers)
1437            npages = pipe_buffers - buffers;
1438
1439        error = get_user_pages_fast((unsigned long)base, npages,
1440                    0, &pages[buffers]);
1441
1442        if (unlikely(error <= 0))
1443            break;
1444
1445        /*
1446         * Fill this contiguous range into the partial page map.
1447         */
1448        for (i = 0; i < error; i++) {
1449            const int plen = min_t(size_t, len, PAGE_SIZE - off);
1450
1451            partial[buffers].offset = off;
1452            partial[buffers].len = plen;
1453
1454            off = 0;
1455            len -= plen;
1456            buffers++;
1457        }
1458
1459        /*
1460         * We didn't complete this iov, stop here since it probably
1461         * means we have to move some of this into a pipe to
1462         * be able to continue.
1463         */
1464        if (len)
1465            break;
1466
1467        /*
1468         * Don't continue if we mapped fewer pages than we asked for,
1469         * or if we mapped the max number of pages that we have
1470         * room for.
1471         */
1472        if (error < npages || buffers == pipe_buffers)
1473            break;
1474
1475        nr_vecs--;
1476        iov++;
1477    }
1478
1479    if (buffers)
1480        return buffers;
1481
1482    return error;
1483}
1484
1485static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1486            struct splice_desc *sd)
1487{
1488    char *src;
1489    int ret;
1490
1491    /*
1492     * See if we can use the atomic maps, by prefaulting in the
1493     * pages and doing an atomic copy
1494     */
1495    if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1496        src = buf->ops->map(pipe, buf, 1);
1497        ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1498                            sd->len);
1499        buf->ops->unmap(pipe, buf, src);
1500        if (!ret) {
1501            ret = sd->len;
1502            goto out;
1503        }
1504    }
1505
1506    /*
1507     * No dice, use slow non-atomic map and copy
1508      */
1509    src = buf->ops->map(pipe, buf, 0);
1510
1511    ret = sd->len;
1512    if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1513        ret = -EFAULT;
1514
1515    buf->ops->unmap(pipe, buf, src);
1516out:
1517    if (ret > 0)
1518        sd->u.userptr += ret;
1519    return ret;
1520}
1521
1522/*
1523 * For lack of a better implementation, implement vmsplice() to userspace
1524 * as a simple copy of the pipes pages to the user iov.
1525 */
1526static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1527                 unsigned long nr_segs, unsigned int flags)
1528{
1529    struct pipe_inode_info *pipe;
1530    struct splice_desc sd;
1531    ssize_t size;
1532    int error;
1533    long ret;
1534
1535    pipe = get_pipe_info(file);
1536    if (!pipe)
1537        return -EBADF;
1538
1539    pipe_lock(pipe);
1540
1541    error = ret = 0;
1542    while (nr_segs) {
1543        void __user *base;
1544        size_t len;
1545
1546        /*
1547         * Get user address base and length for this iovec.
1548         */
1549        error = get_user(base, &iov->iov_base);
1550        if (unlikely(error))
1551            break;
1552        error = get_user(len, &iov->iov_len);
1553        if (unlikely(error))
1554            break;
1555
1556        /*
1557         * Sanity check this iovec. 0 read succeeds.
1558         */
1559        if (unlikely(!len))
1560            break;
1561        if (unlikely(!base)) {
1562            error = -EFAULT;
1563            break;
1564        }
1565
1566        if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1567            error = -EFAULT;
1568            break;
1569        }
1570
1571        sd.len = 0;
1572        sd.total_len = len;
1573        sd.flags = flags;
1574        sd.u.userptr = base;
1575        sd.pos = 0;
1576
1577        size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1578        if (size < 0) {
1579            if (!ret)
1580                ret = size;
1581
1582            break;
1583        }
1584
1585        ret += size;
1586
1587        if (size < len)
1588            break;
1589
1590        nr_segs--;
1591        iov++;
1592    }
1593
1594    pipe_unlock(pipe);
1595
1596    if (!ret)
1597        ret = error;
1598
1599    return ret;
1600}
1601
1602/*
1603 * vmsplice splices a user address range into a pipe. It can be thought of
1604 * as splice-from-memory, where the regular splice is splice-from-file (or
1605 * to file). In both cases the output is a pipe, naturally.
1606 */
1607static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1608                 unsigned long nr_segs, unsigned int flags)
1609{
1610    struct pipe_inode_info *pipe;
1611    struct page *pages[PIPE_DEF_BUFFERS];
1612    struct partial_page partial[PIPE_DEF_BUFFERS];
1613    struct splice_pipe_desc spd = {
1614        .pages = pages,
1615        .partial = partial,
1616        .flags = flags,
1617        .ops = &user_page_pipe_buf_ops,
1618        .spd_release = spd_release_page,
1619    };
1620    long ret;
1621
1622    pipe = get_pipe_info(file);
1623    if (!pipe)
1624        return -EBADF;
1625
1626    if (splice_grow_spd(pipe, &spd))
1627        return -ENOMEM;
1628
1629    spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1630                        spd.partial, flags & SPLICE_F_GIFT,
1631                        pipe->buffers);
1632    if (spd.nr_pages <= 0)
1633        ret = spd.nr_pages;
1634    else
1635        ret = splice_to_pipe(pipe, &spd);
1636
1637    splice_shrink_spd(pipe, &spd);
1638    return ret;
1639}
1640
1641/*
1642 * Note that vmsplice only really supports true splicing _from_ user memory
1643 * to a pipe, not the other way around. Splicing from user memory is a simple
1644 * operation that can be supported without any funky alignment restrictions
1645 * or nasty vm tricks. We simply map in the user memory and fill them into
1646 * a pipe. The reverse isn't quite as easy, though. There are two possible
1647 * solutions for that:
1648 *
1649 * - memcpy() the data internally, at which point we might as well just
1650 * do a regular read() on the buffer anyway.
1651 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1652 * has restriction limitations on both ends of the pipe).
1653 *
1654 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1655 *
1656 */
1657SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1658        unsigned long, nr_segs, unsigned int, flags)
1659{
1660    struct file *file;
1661    long error;
1662    int fput;
1663
1664    if (unlikely(nr_segs > UIO_MAXIOV))
1665        return -EINVAL;
1666    else if (unlikely(!nr_segs))
1667        return 0;
1668
1669    error = -EBADF;
1670    file = fget_light(fd, &fput);
1671    if (file) {
1672        if (file->f_mode & FMODE_WRITE)
1673            error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1674        else if (file->f_mode & FMODE_READ)
1675            error = vmsplice_to_user(file, iov, nr_segs, flags);
1676
1677        fput_light(file, fput);
1678    }
1679
1680    return error;
1681}
1682
1683SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1684        int, fd_out, loff_t __user *, off_out,
1685        size_t, len, unsigned int, flags)
1686{
1687    long error;
1688    struct file *in, *out;
1689    int fput_in, fput_out;
1690
1691    if (unlikely(!len))
1692        return 0;
1693
1694    error = -EBADF;
1695    in = fget_light(fd_in, &fput_in);
1696    if (in) {
1697        if (in->f_mode & FMODE_READ) {
1698            out = fget_light(fd_out, &fput_out);
1699            if (out) {
1700                if (out->f_mode & FMODE_WRITE)
1701                    error = do_splice(in, off_in,
1702                              out, off_out,
1703                              len, flags);
1704                fput_light(out, fput_out);
1705            }
1706        }
1707
1708        fput_light(in, fput_in);
1709    }
1710
1711    return error;
1712}
1713
1714/*
1715 * Make sure there's data to read. Wait for input if we can, otherwise
1716 * return an appropriate error.
1717 */
1718static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1719{
1720    int ret;
1721
1722    /*
1723     * Check ->nrbufs without the inode lock first. This function
1724     * is speculative anyways, so missing one is ok.
1725     */
1726    if (pipe->nrbufs)
1727        return 0;
1728
1729    ret = 0;
1730    pipe_lock(pipe);
1731
1732    while (!pipe->nrbufs) {
1733        if (signal_pending(current)) {
1734            ret = -ERESTARTSYS;
1735            break;
1736        }
1737        if (!pipe->writers)
1738            break;
1739        if (!pipe->waiting_writers) {
1740            if (flags & SPLICE_F_NONBLOCK) {
1741                ret = -EAGAIN;
1742                break;
1743            }
1744        }
1745        pipe_wait(pipe);
1746    }
1747
1748    pipe_unlock(pipe);
1749    return ret;
1750}
1751
1752/*
1753 * Make sure there's writeable room. Wait for room if we can, otherwise
1754 * return an appropriate error.
1755 */
1756static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1757{
1758    int ret;
1759
1760    /*
1761     * Check ->nrbufs without the inode lock first. This function
1762     * is speculative anyways, so missing one is ok.
1763     */
1764    if (pipe->nrbufs < pipe->buffers)
1765        return 0;
1766
1767    ret = 0;
1768    pipe_lock(pipe);
1769
1770    while (pipe->nrbufs >= pipe->buffers) {
1771        if (!pipe->readers) {
1772            send_sig(SIGPIPE, current, 0);
1773            ret = -EPIPE;
1774            break;
1775        }
1776        if (flags & SPLICE_F_NONBLOCK) {
1777            ret = -EAGAIN;
1778            break;
1779        }
1780        if (signal_pending(current)) {
1781            ret = -ERESTARTSYS;
1782            break;
1783        }
1784        pipe->waiting_writers++;
1785        pipe_wait(pipe);
1786        pipe->waiting_writers--;
1787    }
1788
1789    pipe_unlock(pipe);
1790    return ret;
1791}
1792
1793/*
1794 * Splice contents of ipipe to opipe.
1795 */
1796static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1797                   struct pipe_inode_info *opipe,
1798                   size_t len, unsigned int flags)
1799{
1800    struct pipe_buffer *ibuf, *obuf;
1801    int ret = 0, nbuf;
1802    bool input_wakeup = false;
1803
1804
1805retry:
1806    ret = ipipe_prep(ipipe, flags);
1807    if (ret)
1808        return ret;
1809
1810    ret = opipe_prep(opipe, flags);
1811    if (ret)
1812        return ret;
1813
1814    /*
1815     * Potential ABBA deadlock, work around it by ordering lock
1816     * grabbing by pipe info address. Otherwise two different processes
1817     * could deadlock (one doing tee from A -> B, the other from B -> A).
1818     */
1819    pipe_double_lock(ipipe, opipe);
1820
1821    do {
1822        if (!opipe->readers) {
1823            send_sig(SIGPIPE, current, 0);
1824            if (!ret)
1825                ret = -EPIPE;
1826            break;
1827        }
1828
1829        if (!ipipe->nrbufs && !ipipe->writers)
1830            break;
1831
1832        /*
1833         * Cannot make any progress, because either the input
1834         * pipe is empty or the output pipe is full.
1835         */
1836        if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1837            /* Already processed some buffers, break */
1838            if (ret)
1839                break;
1840
1841            if (flags & SPLICE_F_NONBLOCK) {
1842                ret = -EAGAIN;
1843                break;
1844            }
1845
1846            /*
1847             * We raced with another reader/writer and haven't
1848             * managed to process any buffers. A zero return
1849             * value means EOF, so retry instead.
1850             */
1851            pipe_unlock(ipipe);
1852            pipe_unlock(opipe);
1853            goto retry;
1854        }
1855
1856        ibuf = ipipe->bufs + ipipe->curbuf;
1857        nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1858        obuf = opipe->bufs + nbuf;
1859
1860        if (len >= ibuf->len) {
1861            /*
1862             * Simply move the whole buffer from ipipe to opipe
1863             */
1864            *obuf = *ibuf;
1865            ibuf->ops = NULL;
1866            opipe->nrbufs++;
1867            ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1868            ipipe->nrbufs--;
1869            input_wakeup = true;
1870        } else {
1871            /*
1872             * Get a reference to this pipe buffer,
1873             * so we can copy the contents over.
1874             */
1875            ibuf->ops->get(ipipe, ibuf);
1876            *obuf = *ibuf;
1877
1878            /*
1879             * Don't inherit the gift flag, we need to
1880             * prevent multiple steals of this page.
1881             */
1882            obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1883
1884            obuf->len = len;
1885            opipe->nrbufs++;
1886            ibuf->offset += obuf->len;
1887            ibuf->len -= obuf->len;
1888        }
1889        ret += obuf->len;
1890        len -= obuf->len;
1891    } while (len);
1892
1893    pipe_unlock(ipipe);
1894    pipe_unlock(opipe);
1895
1896    /*
1897     * If we put data in the output pipe, wakeup any potential readers.
1898     */
1899    if (ret > 0)
1900        wakeup_pipe_readers(opipe);
1901
1902    if (input_wakeup)
1903        wakeup_pipe_writers(ipipe);
1904
1905    return ret;
1906}
1907
1908/*
1909 * Link contents of ipipe to opipe.
1910 */
1911static int link_pipe(struct pipe_inode_info *ipipe,
1912             struct pipe_inode_info *opipe,
1913             size_t len, unsigned int flags)
1914{
1915    struct pipe_buffer *ibuf, *obuf;
1916    int ret = 0, i = 0, nbuf;
1917
1918    /*
1919     * Potential ABBA deadlock, work around it by ordering lock
1920     * grabbing by pipe info address. Otherwise two different processes
1921     * could deadlock (one doing tee from A -> B, the other from B -> A).
1922     */
1923    pipe_double_lock(ipipe, opipe);
1924
1925    do {
1926        if (!opipe->readers) {
1927            send_sig(SIGPIPE, current, 0);
1928            if (!ret)
1929                ret = -EPIPE;
1930            break;
1931        }
1932
1933        /*
1934         * If we have iterated all input buffers or ran out of
1935         * output room, break.
1936         */
1937        if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1938            break;
1939
1940        ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1941        nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1942
1943        /*
1944         * Get a reference to this pipe buffer,
1945         * so we can copy the contents over.
1946         */
1947        ibuf->ops->get(ipipe, ibuf);
1948
1949        obuf = opipe->bufs + nbuf;
1950        *obuf = *ibuf;
1951
1952        /*
1953         * Don't inherit the gift flag, we need to
1954         * prevent multiple steals of this page.
1955         */
1956        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1957
1958        if (obuf->len > len)
1959            obuf->len = len;
1960
1961        opipe->nrbufs++;
1962        ret += obuf->len;
1963        len -= obuf->len;
1964        i++;
1965    } while (len);
1966
1967    /*
1968     * return EAGAIN if we have the potential of some data in the
1969     * future, otherwise just return 0
1970     */
1971    if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1972        ret = -EAGAIN;
1973
1974    pipe_unlock(ipipe);
1975    pipe_unlock(opipe);
1976
1977    /*
1978     * If we put data in the output pipe, wakeup any potential readers.
1979     */
1980    if (ret > 0)
1981        wakeup_pipe_readers(opipe);
1982
1983    return ret;
1984}
1985
1986/*
1987 * This is a tee(1) implementation that works on pipes. It doesn't copy
1988 * any data, it simply references the 'in' pages on the 'out' pipe.
1989 * The 'flags' used are the SPLICE_F_* variants, currently the only
1990 * applicable one is SPLICE_F_NONBLOCK.
1991 */
1992static long do_tee(struct file *in, struct file *out, size_t len,
1993           unsigned int flags)
1994{
1995    struct pipe_inode_info *ipipe = get_pipe_info(in);
1996    struct pipe_inode_info *opipe = get_pipe_info(out);
1997    int ret = -EINVAL;
1998
1999    /*
2000     * Duplicate the contents of ipipe to opipe without actually
2001     * copying the data.
2002     */
2003    if (ipipe && opipe && ipipe != opipe) {
2004        /*
2005         * Keep going, unless we encounter an error. The ipipe/opipe
2006         * ordering doesn't really matter.
2007         */
2008        ret = ipipe_prep(ipipe, flags);
2009        if (!ret) {
2010            ret = opipe_prep(opipe, flags);
2011            if (!ret)
2012                ret = link_pipe(ipipe, opipe, len, flags);
2013        }
2014    }
2015
2016    return ret;
2017}
2018
2019SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
2020{
2021    struct file *in;
2022    int error, fput_in;
2023
2024    if (unlikely(!len))
2025        return 0;
2026
2027    error = -EBADF;
2028    in = fget_light(fdin, &fput_in);
2029    if (in) {
2030        if (in->f_mode & FMODE_READ) {
2031            int fput_out;
2032            struct file *out = fget_light(fdout, &fput_out);
2033
2034            if (out) {
2035                if (out->f_mode & FMODE_WRITE)
2036                    error = do_tee(in, out, len, flags);
2037                fput_light(out, fput_out);
2038            }
2039        }
2040         fput_light(in, fput_in);
2041     }
2042
2043    return error;
2044}
2045

Archive Download this file



interactive