Root/fs/splice.c

1/*
2 * "splice": joining two ropes together by interweaving their strands.
3 *
4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other.
7 *
8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer.
10 *
11 * Named by Larry McVoy, original implementation from Linus, extended by
12 * Jens to support splicing to files, network, direct splicing, etc and
13 * fixing lots of bugs.
14 *
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18 *
19 */
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/pagemap.h>
23#include <linux/splice.h>
24#include <linux/memcontrol.h>
25#include <linux/mm_inline.h>
26#include <linux/swap.h>
27#include <linux/writeback.h>
28#include <linux/buffer_head.h>
29#include <linux/module.h>
30#include <linux/syscalls.h>
31#include <linux/uio.h>
32#include <linux/security.h>
33
34/*
35 * Attempt to steal a page from a pipe buffer. This should perhaps go into
36 * a vm helper function, it's already simplified quite a bit by the
37 * addition of remove_mapping(). If success is returned, the caller may
38 * attempt to reuse this page for another destination.
39 */
40static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
41                     struct pipe_buffer *buf)
42{
43    struct page *page = buf->page;
44    struct address_space *mapping;
45
46    lock_page(page);
47
48    mapping = page_mapping(page);
49    if (mapping) {
50        WARN_ON(!PageUptodate(page));
51
52        /*
53         * At least for ext2 with nobh option, we need to wait on
54         * writeback completing on this page, since we'll remove it
55         * from the pagecache. Otherwise truncate wont wait on the
56         * page, allowing the disk blocks to be reused by someone else
57         * before we actually wrote our data to them. fs corruption
58         * ensues.
59         */
60        wait_on_page_writeback(page);
61
62        if (page_has_private(page) &&
63            !try_to_release_page(page, GFP_KERNEL))
64            goto out_unlock;
65
66        /*
67         * If we succeeded in removing the mapping, set LRU flag
68         * and return good.
69         */
70        if (remove_mapping(mapping, page)) {
71            buf->flags |= PIPE_BUF_FLAG_LRU;
72            return 0;
73        }
74    }
75
76    /*
77     * Raced with truncate or failed to remove page from current
78     * address space, unlock and return failure.
79     */
80out_unlock:
81    unlock_page(page);
82    return 1;
83}
84
85static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
86                    struct pipe_buffer *buf)
87{
88    page_cache_release(buf->page);
89    buf->flags &= ~PIPE_BUF_FLAG_LRU;
90}
91
92/*
93 * Check whether the contents of buf is OK to access. Since the content
94 * is a page cache page, IO may be in flight.
95 */
96static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
97                       struct pipe_buffer *buf)
98{
99    struct page *page = buf->page;
100    int err;
101
102    if (!PageUptodate(page)) {
103        lock_page(page);
104
105        /*
106         * Page got truncated/unhashed. This will cause a 0-byte
107         * splice, if this is the first page.
108         */
109        if (!page->mapping) {
110            err = -ENODATA;
111            goto error;
112        }
113
114        /*
115         * Uh oh, read-error from disk.
116         */
117        if (!PageUptodate(page)) {
118            err = -EIO;
119            goto error;
120        }
121
122        /*
123         * Page is ok afterall, we are done.
124         */
125        unlock_page(page);
126    }
127
128    return 0;
129error:
130    unlock_page(page);
131    return err;
132}
133
134static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
135    .can_merge = 0,
136    .map = generic_pipe_buf_map,
137    .unmap = generic_pipe_buf_unmap,
138    .confirm = page_cache_pipe_buf_confirm,
139    .release = page_cache_pipe_buf_release,
140    .steal = page_cache_pipe_buf_steal,
141    .get = generic_pipe_buf_get,
142};
143
144static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
145                    struct pipe_buffer *buf)
146{
147    if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
148        return 1;
149
150    buf->flags |= PIPE_BUF_FLAG_LRU;
151    return generic_pipe_buf_steal(pipe, buf);
152}
153
154static const struct pipe_buf_operations user_page_pipe_buf_ops = {
155    .can_merge = 0,
156    .map = generic_pipe_buf_map,
157    .unmap = generic_pipe_buf_unmap,
158    .confirm = generic_pipe_buf_confirm,
159    .release = page_cache_pipe_buf_release,
160    .steal = user_page_pipe_buf_steal,
161    .get = generic_pipe_buf_get,
162};
163
164/**
165 * splice_to_pipe - fill passed data into a pipe
166 * @pipe: pipe to fill
167 * @spd: data to fill
168 *
169 * Description:
170 * @spd contains a map of pages and len/offset tuples, along with
171 * the struct pipe_buf_operations associated with these pages. This
172 * function will link that data to the pipe.
173 *
174 */
175ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
176               struct splice_pipe_desc *spd)
177{
178    unsigned int spd_pages = spd->nr_pages;
179    int ret, do_wakeup, page_nr;
180
181    ret = 0;
182    do_wakeup = 0;
183    page_nr = 0;
184
185    pipe_lock(pipe);
186
187    for (;;) {
188        if (!pipe->readers) {
189            send_sig(SIGPIPE, current, 0);
190            if (!ret)
191                ret = -EPIPE;
192            break;
193        }
194
195        if (pipe->nrbufs < PIPE_BUFFERS) {
196            int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
197            struct pipe_buffer *buf = pipe->bufs + newbuf;
198
199            buf->page = spd->pages[page_nr];
200            buf->offset = spd->partial[page_nr].offset;
201            buf->len = spd->partial[page_nr].len;
202            buf->private = spd->partial[page_nr].private;
203            buf->ops = spd->ops;
204            if (spd->flags & SPLICE_F_GIFT)
205                buf->flags |= PIPE_BUF_FLAG_GIFT;
206
207            pipe->nrbufs++;
208            page_nr++;
209            ret += buf->len;
210
211            if (pipe->inode)
212                do_wakeup = 1;
213
214            if (!--spd->nr_pages)
215                break;
216            if (pipe->nrbufs < PIPE_BUFFERS)
217                continue;
218
219            break;
220        }
221
222        if (spd->flags & SPLICE_F_NONBLOCK) {
223            if (!ret)
224                ret = -EAGAIN;
225            break;
226        }
227
228        if (signal_pending(current)) {
229            if (!ret)
230                ret = -ERESTARTSYS;
231            break;
232        }
233
234        if (do_wakeup) {
235            smp_mb();
236            if (waitqueue_active(&pipe->wait))
237                wake_up_interruptible_sync(&pipe->wait);
238            kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
239            do_wakeup = 0;
240        }
241
242        pipe->waiting_writers++;
243        pipe_wait(pipe);
244        pipe->waiting_writers--;
245    }
246
247    pipe_unlock(pipe);
248
249    if (do_wakeup) {
250        smp_mb();
251        if (waitqueue_active(&pipe->wait))
252            wake_up_interruptible(&pipe->wait);
253        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
254    }
255
256    while (page_nr < spd_pages)
257        spd->spd_release(spd, page_nr++);
258
259    return ret;
260}
261
262static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
263{
264    page_cache_release(spd->pages[i]);
265}
266
267static int
268__generic_file_splice_read(struct file *in, loff_t *ppos,
269               struct pipe_inode_info *pipe, size_t len,
270               unsigned int flags)
271{
272    struct address_space *mapping = in->f_mapping;
273    unsigned int loff, nr_pages, req_pages;
274    struct page *pages[PIPE_BUFFERS];
275    struct partial_page partial[PIPE_BUFFERS];
276    struct page *page;
277    pgoff_t index, end_index;
278    loff_t isize;
279    int error, page_nr;
280    struct splice_pipe_desc spd = {
281        .pages = pages,
282        .partial = partial,
283        .flags = flags,
284        .ops = &page_cache_pipe_buf_ops,
285        .spd_release = spd_release_page,
286    };
287
288    index = *ppos >> PAGE_CACHE_SHIFT;
289    loff = *ppos & ~PAGE_CACHE_MASK;
290    req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
291    nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
292
293    /*
294     * Lookup the (hopefully) full range of pages we need.
295     */
296    spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
297    index += spd.nr_pages;
298
299    /*
300     * If find_get_pages_contig() returned fewer pages than we needed,
301     * readahead/allocate the rest and fill in the holes.
302     */
303    if (spd.nr_pages < nr_pages)
304        page_cache_sync_readahead(mapping, &in->f_ra, in,
305                index, req_pages - spd.nr_pages);
306
307    error = 0;
308    while (spd.nr_pages < nr_pages) {
309        /*
310         * Page could be there, find_get_pages_contig() breaks on
311         * the first hole.
312         */
313        page = find_get_page(mapping, index);
314        if (!page) {
315            /*
316             * page didn't exist, allocate one.
317             */
318            page = page_cache_alloc_cold(mapping);
319            if (!page)
320                break;
321
322            error = add_to_page_cache_lru(page, mapping, index,
323                        mapping_gfp_mask(mapping));
324            if (unlikely(error)) {
325                page_cache_release(page);
326                if (error == -EEXIST)
327                    continue;
328                break;
329            }
330            /*
331             * add_to_page_cache() locks the page, unlock it
332             * to avoid convoluting the logic below even more.
333             */
334            unlock_page(page);
335        }
336
337        pages[spd.nr_pages++] = page;
338        index++;
339    }
340
341    /*
342     * Now loop over the map and see if we need to start IO on any
343     * pages, fill in the partial map, etc.
344     */
345    index = *ppos >> PAGE_CACHE_SHIFT;
346    nr_pages = spd.nr_pages;
347    spd.nr_pages = 0;
348    for (page_nr = 0; page_nr < nr_pages; page_nr++) {
349        unsigned int this_len;
350
351        if (!len)
352            break;
353
354        /*
355         * this_len is the max we'll use from this page
356         */
357        this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
358        page = pages[page_nr];
359
360        if (PageReadahead(page))
361            page_cache_async_readahead(mapping, &in->f_ra, in,
362                    page, index, req_pages - page_nr);
363
364        /*
365         * If the page isn't uptodate, we may need to start io on it
366         */
367        if (!PageUptodate(page)) {
368            /*
369             * If in nonblock mode then dont block on waiting
370             * for an in-flight io page
371             */
372            if (flags & SPLICE_F_NONBLOCK) {
373                if (!trylock_page(page)) {
374                    error = -EAGAIN;
375                    break;
376                }
377            } else
378                lock_page(page);
379
380            /*
381             * Page was truncated, or invalidated by the
382             * filesystem. Redo the find/create, but this time the
383             * page is kept locked, so there's no chance of another
384             * race with truncate/invalidate.
385             */
386            if (!page->mapping) {
387                unlock_page(page);
388                page = find_or_create_page(mapping, index,
389                        mapping_gfp_mask(mapping));
390
391                if (!page) {
392                    error = -ENOMEM;
393                    break;
394                }
395                page_cache_release(pages[page_nr]);
396                pages[page_nr] = page;
397            }
398            /*
399             * page was already under io and is now done, great
400             */
401            if (PageUptodate(page)) {
402                unlock_page(page);
403                goto fill_it;
404            }
405
406            /*
407             * need to read in the page
408             */
409            error = mapping->a_ops->readpage(in, page);
410            if (unlikely(error)) {
411                /*
412                 * We really should re-lookup the page here,
413                 * but it complicates things a lot. Instead
414                 * lets just do what we already stored, and
415                 * we'll get it the next time we are called.
416                 */
417                if (error == AOP_TRUNCATED_PAGE)
418                    error = 0;
419
420                break;
421            }
422        }
423fill_it:
424        /*
425         * i_size must be checked after PageUptodate.
426         */
427        isize = i_size_read(mapping->host);
428        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
429        if (unlikely(!isize || index > end_index))
430            break;
431
432        /*
433         * if this is the last page, see if we need to shrink
434         * the length and stop
435         */
436        if (end_index == index) {
437            unsigned int plen;
438
439            /*
440             * max good bytes in this page
441             */
442            plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
443            if (plen <= loff)
444                break;
445
446            /*
447             * force quit after adding this page
448             */
449            this_len = min(this_len, plen - loff);
450            len = this_len;
451        }
452
453        partial[page_nr].offset = loff;
454        partial[page_nr].len = this_len;
455        len -= this_len;
456        loff = 0;
457        spd.nr_pages++;
458        index++;
459    }
460
461    /*
462     * Release any pages at the end, if we quit early. 'page_nr' is how far
463     * we got, 'nr_pages' is how many pages are in the map.
464     */
465    while (page_nr < nr_pages)
466        page_cache_release(pages[page_nr++]);
467    in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
468
469    if (spd.nr_pages)
470        return splice_to_pipe(pipe, &spd);
471
472    return error;
473}
474
475/**
476 * generic_file_splice_read - splice data from file to a pipe
477 * @in: file to splice from
478 * @ppos: position in @in
479 * @pipe: pipe to splice to
480 * @len: number of bytes to splice
481 * @flags: splice modifier flags
482 *
483 * Description:
484 * Will read pages from given file and fill them into a pipe. Can be
485 * used as long as the address_space operations for the source implements
486 * a readpage() hook.
487 *
488 */
489ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
490                 struct pipe_inode_info *pipe, size_t len,
491                 unsigned int flags)
492{
493    loff_t isize, left;
494    int ret;
495
496    isize = i_size_read(in->f_mapping->host);
497    if (unlikely(*ppos >= isize))
498        return 0;
499
500    left = isize - *ppos;
501    if (unlikely(left < len))
502        len = left;
503
504    ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
505    if (ret > 0)
506        *ppos += ret;
507
508    return ret;
509}
510EXPORT_SYMBOL(generic_file_splice_read);
511
512static const struct pipe_buf_operations default_pipe_buf_ops = {
513    .can_merge = 0,
514    .map = generic_pipe_buf_map,
515    .unmap = generic_pipe_buf_unmap,
516    .confirm = generic_pipe_buf_confirm,
517    .release = generic_pipe_buf_release,
518    .steal = generic_pipe_buf_steal,
519    .get = generic_pipe_buf_get,
520};
521
522static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
523                unsigned long vlen, loff_t offset)
524{
525    mm_segment_t old_fs;
526    loff_t pos = offset;
527    ssize_t res;
528
529    old_fs = get_fs();
530    set_fs(get_ds());
531    /* The cast to a user pointer is valid due to the set_fs() */
532    res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
533    set_fs(old_fs);
534
535    return res;
536}
537
538static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
539                loff_t pos)
540{
541    mm_segment_t old_fs;
542    ssize_t res;
543
544    old_fs = get_fs();
545    set_fs(get_ds());
546    /* The cast to a user pointer is valid due to the set_fs() */
547    res = vfs_write(file, (const char __user *)buf, count, &pos);
548    set_fs(old_fs);
549
550    return res;
551}
552
553ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
554                 struct pipe_inode_info *pipe, size_t len,
555                 unsigned int flags)
556{
557    unsigned int nr_pages;
558    unsigned int nr_freed;
559    size_t offset;
560    struct page *pages[PIPE_BUFFERS];
561    struct partial_page partial[PIPE_BUFFERS];
562    struct iovec vec[PIPE_BUFFERS];
563    pgoff_t index;
564    ssize_t res;
565    size_t this_len;
566    int error;
567    int i;
568    struct splice_pipe_desc spd = {
569        .pages = pages,
570        .partial = partial,
571        .flags = flags,
572        .ops = &default_pipe_buf_ops,
573        .spd_release = spd_release_page,
574    };
575
576    index = *ppos >> PAGE_CACHE_SHIFT;
577    offset = *ppos & ~PAGE_CACHE_MASK;
578    nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
579
580    for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
581        struct page *page;
582
583        page = alloc_page(GFP_USER);
584        error = -ENOMEM;
585        if (!page)
586            goto err;
587
588        this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
589        vec[i].iov_base = (void __user *) page_address(page);
590        vec[i].iov_len = this_len;
591        pages[i] = page;
592        spd.nr_pages++;
593        len -= this_len;
594        offset = 0;
595    }
596
597    res = kernel_readv(in, vec, spd.nr_pages, *ppos);
598    if (res < 0) {
599        error = res;
600        goto err;
601    }
602
603    error = 0;
604    if (!res)
605        goto err;
606
607    nr_freed = 0;
608    for (i = 0; i < spd.nr_pages; i++) {
609        this_len = min_t(size_t, vec[i].iov_len, res);
610        partial[i].offset = 0;
611        partial[i].len = this_len;
612        if (!this_len) {
613            __free_page(pages[i]);
614            pages[i] = NULL;
615            nr_freed++;
616        }
617        res -= this_len;
618    }
619    spd.nr_pages -= nr_freed;
620
621    res = splice_to_pipe(pipe, &spd);
622    if (res > 0)
623        *ppos += res;
624
625    return res;
626
627err:
628    for (i = 0; i < spd.nr_pages; i++)
629        __free_page(pages[i]);
630
631    return error;
632}
633EXPORT_SYMBOL(default_file_splice_read);
634
635/*
636 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
637 * using sendpage(). Return the number of bytes sent.
638 */
639static int pipe_to_sendpage(struct pipe_inode_info *pipe,
640                struct pipe_buffer *buf, struct splice_desc *sd)
641{
642    struct file *file = sd->u.file;
643    loff_t pos = sd->pos;
644    int ret, more;
645
646    ret = buf->ops->confirm(pipe, buf);
647    if (!ret) {
648        more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
649
650        ret = file->f_op->sendpage(file, buf->page, buf->offset,
651                       sd->len, &pos, more);
652    }
653
654    return ret;
655}
656
657/*
658 * This is a little more tricky than the file -> pipe splicing. There are
659 * basically three cases:
660 *
661 * - Destination page already exists in the address space and there
662 * are users of it. For that case we have no other option that
663 * copying the data. Tough luck.
664 * - Destination page already exists in the address space, but there
665 * are no users of it. Make sure it's uptodate, then drop it. Fall
666 * through to last case.
667 * - Destination page does not exist, we can add the pipe page to
668 * the page cache and avoid the copy.
669 *
670 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
671 * sd->flags), we attempt to migrate pages from the pipe to the output
672 * file address space page cache. This is possible if no one else has
673 * the pipe page referenced outside of the pipe and page cache. If
674 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
675 * a new page in the output file page cache and fill/dirty that.
676 */
677int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
678         struct splice_desc *sd)
679{
680    struct file *file = sd->u.file;
681    struct address_space *mapping = file->f_mapping;
682    unsigned int offset, this_len;
683    struct page *page;
684    void *fsdata;
685    int ret;
686
687    /*
688     * make sure the data in this buffer is uptodate
689     */
690    ret = buf->ops->confirm(pipe, buf);
691    if (unlikely(ret))
692        return ret;
693
694    offset = sd->pos & ~PAGE_CACHE_MASK;
695
696    this_len = sd->len;
697    if (this_len + offset > PAGE_CACHE_SIZE)
698        this_len = PAGE_CACHE_SIZE - offset;
699
700    ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
701                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
702    if (unlikely(ret))
703        goto out;
704
705    if (buf->page != page) {
706        /*
707         * Careful, ->map() uses KM_USER0!
708         */
709        char *src = buf->ops->map(pipe, buf, 1);
710        char *dst = kmap_atomic(page, KM_USER1);
711
712        memcpy(dst + offset, src + buf->offset, this_len);
713        flush_dcache_page(page);
714        kunmap_atomic(dst, KM_USER1);
715        buf->ops->unmap(pipe, buf, src);
716    }
717    ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
718                page, fsdata);
719out:
720    return ret;
721}
722EXPORT_SYMBOL(pipe_to_file);
723
724static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
725{
726    smp_mb();
727    if (waitqueue_active(&pipe->wait))
728        wake_up_interruptible(&pipe->wait);
729    kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
730}
731
732/**
733 * splice_from_pipe_feed - feed available data from a pipe to a file
734 * @pipe: pipe to splice from
735 * @sd: information to @actor
736 * @actor: handler that splices the data
737 *
738 * Description:
739 * This function loops over the pipe and calls @actor to do the
740 * actual moving of a single struct pipe_buffer to the desired
741 * destination. It returns when there's no more buffers left in
742 * the pipe or if the requested number of bytes (@sd->total_len)
743 * have been copied. It returns a positive number (one) if the
744 * pipe needs to be filled with more data, zero if the required
745 * number of bytes have been copied and -errno on error.
746 *
747 * This, together with splice_from_pipe_{begin,end,next}, may be
748 * used to implement the functionality of __splice_from_pipe() when
749 * locking is required around copying the pipe buffers to the
750 * destination.
751 */
752int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
753              splice_actor *actor)
754{
755    int ret;
756
757    while (pipe->nrbufs) {
758        struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
759        const struct pipe_buf_operations *ops = buf->ops;
760
761        sd->len = buf->len;
762        if (sd->len > sd->total_len)
763            sd->len = sd->total_len;
764
765        ret = actor(pipe, buf, sd);
766        if (ret <= 0) {
767            if (ret == -ENODATA)
768                ret = 0;
769            return ret;
770        }
771        buf->offset += ret;
772        buf->len -= ret;
773
774        sd->num_spliced += ret;
775        sd->len -= ret;
776        sd->pos += ret;
777        sd->total_len -= ret;
778
779        if (!buf->len) {
780            buf->ops = NULL;
781            ops->release(pipe, buf);
782            pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
783            pipe->nrbufs--;
784            if (pipe->inode)
785                sd->need_wakeup = true;
786        }
787
788        if (!sd->total_len)
789            return 0;
790    }
791
792    return 1;
793}
794EXPORT_SYMBOL(splice_from_pipe_feed);
795
796/**
797 * splice_from_pipe_next - wait for some data to splice from
798 * @pipe: pipe to splice from
799 * @sd: information about the splice operation
800 *
801 * Description:
802 * This function will wait for some data and return a positive
803 * value (one) if pipe buffers are available. It will return zero
804 * or -errno if no more data needs to be spliced.
805 */
806int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
807{
808    while (!pipe->nrbufs) {
809        if (!pipe->writers)
810            return 0;
811
812        if (!pipe->waiting_writers && sd->num_spliced)
813            return 0;
814
815        if (sd->flags & SPLICE_F_NONBLOCK)
816            return -EAGAIN;
817
818        if (signal_pending(current))
819            return -ERESTARTSYS;
820
821        if (sd->need_wakeup) {
822            wakeup_pipe_writers(pipe);
823            sd->need_wakeup = false;
824        }
825
826        pipe_wait(pipe);
827    }
828
829    return 1;
830}
831EXPORT_SYMBOL(splice_from_pipe_next);
832
833/**
834 * splice_from_pipe_begin - start splicing from pipe
835 * @sd: information about the splice operation
836 *
837 * Description:
838 * This function should be called before a loop containing
839 * splice_from_pipe_next() and splice_from_pipe_feed() to
840 * initialize the necessary fields of @sd.
841 */
842void splice_from_pipe_begin(struct splice_desc *sd)
843{
844    sd->num_spliced = 0;
845    sd->need_wakeup = false;
846}
847EXPORT_SYMBOL(splice_from_pipe_begin);
848
849/**
850 * splice_from_pipe_end - finish splicing from pipe
851 * @pipe: pipe to splice from
852 * @sd: information about the splice operation
853 *
854 * Description:
855 * This function will wake up pipe writers if necessary. It should
856 * be called after a loop containing splice_from_pipe_next() and
857 * splice_from_pipe_feed().
858 */
859void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
860{
861    if (sd->need_wakeup)
862        wakeup_pipe_writers(pipe);
863}
864EXPORT_SYMBOL(splice_from_pipe_end);
865
866/**
867 * __splice_from_pipe - splice data from a pipe to given actor
868 * @pipe: pipe to splice from
869 * @sd: information to @actor
870 * @actor: handler that splices the data
871 *
872 * Description:
873 * This function does little more than loop over the pipe and call
874 * @actor to do the actual moving of a single struct pipe_buffer to
875 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
876 * pipe_to_user.
877 *
878 */
879ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
880               splice_actor *actor)
881{
882    int ret;
883
884    splice_from_pipe_begin(sd);
885    do {
886        ret = splice_from_pipe_next(pipe, sd);
887        if (ret > 0)
888            ret = splice_from_pipe_feed(pipe, sd, actor);
889    } while (ret > 0);
890    splice_from_pipe_end(pipe, sd);
891
892    return sd->num_spliced ? sd->num_spliced : ret;
893}
894EXPORT_SYMBOL(__splice_from_pipe);
895
896/**
897 * splice_from_pipe - splice data from a pipe to a file
898 * @pipe: pipe to splice from
899 * @out: file to splice to
900 * @ppos: position in @out
901 * @len: how many bytes to splice
902 * @flags: splice modifier flags
903 * @actor: handler that splices the data
904 *
905 * Description:
906 * See __splice_from_pipe. This function locks the pipe inode,
907 * otherwise it's identical to __splice_from_pipe().
908 *
909 */
910ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
911             loff_t *ppos, size_t len, unsigned int flags,
912             splice_actor *actor)
913{
914    ssize_t ret;
915    struct splice_desc sd = {
916        .total_len = len,
917        .flags = flags,
918        .pos = *ppos,
919        .u.file = out,
920    };
921
922    pipe_lock(pipe);
923    ret = __splice_from_pipe(pipe, &sd, actor);
924    pipe_unlock(pipe);
925
926    return ret;
927}
928
929/**
930 * generic_file_splice_write - splice data from a pipe to a file
931 * @pipe: pipe info
932 * @out: file to write to
933 * @ppos: position in @out
934 * @len: number of bytes to splice
935 * @flags: splice modifier flags
936 *
937 * Description:
938 * Will either move or copy pages (determined by @flags options) from
939 * the given pipe inode to the given file.
940 *
941 */
942ssize_t
943generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
944              loff_t *ppos, size_t len, unsigned int flags)
945{
946    struct address_space *mapping = out->f_mapping;
947    struct inode *inode = mapping->host;
948    struct splice_desc sd = {
949        .total_len = len,
950        .flags = flags,
951        .pos = *ppos,
952        .u.file = out,
953    };
954    ssize_t ret;
955
956    pipe_lock(pipe);
957
958    splice_from_pipe_begin(&sd);
959    do {
960        ret = splice_from_pipe_next(pipe, &sd);
961        if (ret <= 0)
962            break;
963
964        mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
965        ret = file_remove_suid(out);
966        if (!ret)
967            ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
968        mutex_unlock(&inode->i_mutex);
969    } while (ret > 0);
970    splice_from_pipe_end(pipe, &sd);
971
972    pipe_unlock(pipe);
973
974    if (sd.num_spliced)
975        ret = sd.num_spliced;
976
977    if (ret > 0) {
978        unsigned long nr_pages;
979
980        *ppos += ret;
981        nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
982
983        /*
984         * If file or inode is SYNC and we actually wrote some data,
985         * sync it.
986         */
987        if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
988            int err;
989
990            mutex_lock(&inode->i_mutex);
991            err = generic_osync_inode(inode, mapping,
992                          OSYNC_METADATA|OSYNC_DATA);
993            mutex_unlock(&inode->i_mutex);
994
995            if (err)
996                ret = err;
997        }
998        balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
999    }
1000
1001    return ret;
1002}
1003
1004EXPORT_SYMBOL(generic_file_splice_write);
1005
1006static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1007              struct splice_desc *sd)
1008{
1009    int ret;
1010    void *data;
1011
1012    ret = buf->ops->confirm(pipe, buf);
1013    if (ret)
1014        return ret;
1015
1016    data = buf->ops->map(pipe, buf, 0);
1017    ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1018    buf->ops->unmap(pipe, buf, data);
1019
1020    return ret;
1021}
1022
1023static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
1024                     struct file *out, loff_t *ppos,
1025                     size_t len, unsigned int flags)
1026{
1027    ssize_t ret;
1028
1029    ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
1030    if (ret > 0)
1031        *ppos += ret;
1032
1033    return ret;
1034}
1035
1036/**
1037 * generic_splice_sendpage - splice data from a pipe to a socket
1038 * @pipe: pipe to splice from
1039 * @out: socket to write to
1040 * @ppos: position in @out
1041 * @len: number of bytes to splice
1042 * @flags: splice modifier flags
1043 *
1044 * Description:
1045 * Will send @len bytes from the pipe to a network socket. No data copying
1046 * is involved.
1047 *
1048 */
1049ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
1050                loff_t *ppos, size_t len, unsigned int flags)
1051{
1052    return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
1053}
1054
1055EXPORT_SYMBOL(generic_splice_sendpage);
1056
1057/*
1058 * Attempt to initiate a splice from pipe to file.
1059 */
1060static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1061               loff_t *ppos, size_t len, unsigned int flags)
1062{
1063    ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1064                loff_t *, size_t, unsigned int);
1065    int ret;
1066
1067    if (unlikely(!(out->f_mode & FMODE_WRITE)))
1068        return -EBADF;
1069
1070    if (unlikely(out->f_flags & O_APPEND))
1071        return -EINVAL;
1072
1073    ret = rw_verify_area(WRITE, out, ppos, len);
1074    if (unlikely(ret < 0))
1075        return ret;
1076
1077    splice_write = out->f_op->splice_write;
1078    if (!splice_write)
1079        splice_write = default_file_splice_write;
1080
1081    return splice_write(pipe, out, ppos, len, flags);
1082}
1083
1084/*
1085 * Attempt to initiate a splice from a file to a pipe.
1086 */
1087static long do_splice_to(struct file *in, loff_t *ppos,
1088             struct pipe_inode_info *pipe, size_t len,
1089             unsigned int flags)
1090{
1091    ssize_t (*splice_read)(struct file *, loff_t *,
1092                   struct pipe_inode_info *, size_t, unsigned int);
1093    int ret;
1094
1095    if (unlikely(!(in->f_mode & FMODE_READ)))
1096        return -EBADF;
1097
1098    ret = rw_verify_area(READ, in, ppos, len);
1099    if (unlikely(ret < 0))
1100        return ret;
1101
1102    splice_read = in->f_op->splice_read;
1103    if (!splice_read)
1104        splice_read = default_file_splice_read;
1105
1106    return splice_read(in, ppos, pipe, len, flags);
1107}
1108
1109/**
1110 * splice_direct_to_actor - splices data directly between two non-pipes
1111 * @in: file to splice from
1112 * @sd: actor information on where to splice to
1113 * @actor: handles the data splicing
1114 *
1115 * Description:
1116 * This is a special case helper to splice directly between two
1117 * points, without requiring an explicit pipe. Internally an allocated
1118 * pipe is cached in the process, and reused during the lifetime of
1119 * that process.
1120 *
1121 */
1122ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1123                   splice_direct_actor *actor)
1124{
1125    struct pipe_inode_info *pipe;
1126    long ret, bytes;
1127    umode_t i_mode;
1128    size_t len;
1129    int i, flags;
1130
1131    /*
1132     * We require the input being a regular file, as we don't want to
1133     * randomly drop data for eg socket -> socket splicing. Use the
1134     * piped splicing for that!
1135     */
1136    i_mode = in->f_path.dentry->d_inode->i_mode;
1137    if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1138        return -EINVAL;
1139
1140    /*
1141     * neither in nor out is a pipe, setup an internal pipe attached to
1142     * 'out' and transfer the wanted data from 'in' to 'out' through that
1143     */
1144    pipe = current->splice_pipe;
1145    if (unlikely(!pipe)) {
1146        pipe = alloc_pipe_info(NULL);
1147        if (!pipe)
1148            return -ENOMEM;
1149
1150        /*
1151         * We don't have an immediate reader, but we'll read the stuff
1152         * out of the pipe right after the splice_to_pipe(). So set
1153         * PIPE_READERS appropriately.
1154         */
1155        pipe->readers = 1;
1156
1157        current->splice_pipe = pipe;
1158    }
1159
1160    /*
1161     * Do the splice.
1162     */
1163    ret = 0;
1164    bytes = 0;
1165    len = sd->total_len;
1166    flags = sd->flags;
1167
1168    /*
1169     * Don't block on output, we have to drain the direct pipe.
1170     */
1171    sd->flags &= ~SPLICE_F_NONBLOCK;
1172
1173    while (len) {
1174        size_t read_len;
1175        loff_t pos = sd->pos, prev_pos = pos;
1176
1177        ret = do_splice_to(in, &pos, pipe, len, flags);
1178        if (unlikely(ret <= 0))
1179            goto out_release;
1180
1181        read_len = ret;
1182        sd->total_len = read_len;
1183
1184        /*
1185         * NOTE: nonblocking mode only applies to the input. We
1186         * must not do the output in nonblocking mode as then we
1187         * could get stuck data in the internal pipe:
1188         */
1189        ret = actor(pipe, sd);
1190        if (unlikely(ret <= 0)) {
1191            sd->pos = prev_pos;
1192            goto out_release;
1193        }
1194
1195        bytes += ret;
1196        len -= ret;
1197        sd->pos = pos;
1198
1199        if (ret < read_len) {
1200            sd->pos = prev_pos + ret;
1201            goto out_release;
1202        }
1203    }
1204
1205done:
1206    pipe->nrbufs = pipe->curbuf = 0;
1207    file_accessed(in);
1208    return bytes;
1209
1210out_release:
1211    /*
1212     * If we did an incomplete transfer we must release
1213     * the pipe buffers in question:
1214     */
1215    for (i = 0; i < PIPE_BUFFERS; i++) {
1216        struct pipe_buffer *buf = pipe->bufs + i;
1217
1218        if (buf->ops) {
1219            buf->ops->release(pipe, buf);
1220            buf->ops = NULL;
1221        }
1222    }
1223
1224    if (!bytes)
1225        bytes = ret;
1226
1227    goto done;
1228}
1229EXPORT_SYMBOL(splice_direct_to_actor);
1230
1231static int direct_splice_actor(struct pipe_inode_info *pipe,
1232                   struct splice_desc *sd)
1233{
1234    struct file *file = sd->u.file;
1235
1236    return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1237}
1238
1239/**
1240 * do_splice_direct - splices data directly between two files
1241 * @in: file to splice from
1242 * @ppos: input file offset
1243 * @out: file to splice to
1244 * @len: number of bytes to splice
1245 * @flags: splice modifier flags
1246 *
1247 * Description:
1248 * For use by do_sendfile(). splice can easily emulate sendfile, but
1249 * doing it in the application would incur an extra system call
1250 * (splice in + splice out, as compared to just sendfile()). So this helper
1251 * can splice directly through a process-private pipe.
1252 *
1253 */
1254long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1255              size_t len, unsigned int flags)
1256{
1257    struct splice_desc sd = {
1258        .len = len,
1259        .total_len = len,
1260        .flags = flags,
1261        .pos = *ppos,
1262        .u.file = out,
1263    };
1264    long ret;
1265
1266    ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1267    if (ret > 0)
1268        *ppos = sd.pos;
1269
1270    return ret;
1271}
1272
1273static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1274                   struct pipe_inode_info *opipe,
1275                   size_t len, unsigned int flags);
1276/*
1277 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1278 * location, so checking ->i_pipe is not enough to verify that this is a
1279 * pipe.
1280 */
1281static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1282{
1283    if (S_ISFIFO(inode->i_mode))
1284        return inode->i_pipe;
1285
1286    return NULL;
1287}
1288
1289/*
1290 * Determine where to splice to/from.
1291 */
1292static long do_splice(struct file *in, loff_t __user *off_in,
1293              struct file *out, loff_t __user *off_out,
1294              size_t len, unsigned int flags)
1295{
1296    struct pipe_inode_info *ipipe;
1297    struct pipe_inode_info *opipe;
1298    loff_t offset, *off;
1299    long ret;
1300
1301    ipipe = pipe_info(in->f_path.dentry->d_inode);
1302    opipe = pipe_info(out->f_path.dentry->d_inode);
1303
1304    if (ipipe && opipe) {
1305        if (off_in || off_out)
1306            return -ESPIPE;
1307
1308        if (!(in->f_mode & FMODE_READ))
1309            return -EBADF;
1310
1311        if (!(out->f_mode & FMODE_WRITE))
1312            return -EBADF;
1313
1314        /* Splicing to self would be fun, but... */
1315        if (ipipe == opipe)
1316            return -EINVAL;
1317
1318        return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1319    }
1320
1321    if (ipipe) {
1322        if (off_in)
1323            return -ESPIPE;
1324        if (off_out) {
1325            if (out->f_op->llseek == no_llseek)
1326                return -EINVAL;
1327            if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1328                return -EFAULT;
1329            off = &offset;
1330        } else
1331            off = &out->f_pos;
1332
1333        ret = do_splice_from(ipipe, out, off, len, flags);
1334
1335        if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1336            ret = -EFAULT;
1337
1338        return ret;
1339    }
1340
1341    if (opipe) {
1342        if (off_out)
1343            return -ESPIPE;
1344        if (off_in) {
1345            if (in->f_op->llseek == no_llseek)
1346                return -EINVAL;
1347            if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1348                return -EFAULT;
1349            off = &offset;
1350        } else
1351            off = &in->f_pos;
1352
1353        ret = do_splice_to(in, off, opipe, len, flags);
1354
1355        if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1356            ret = -EFAULT;
1357
1358        return ret;
1359    }
1360
1361    return -EINVAL;
1362}
1363
1364/*
1365 * Map an iov into an array of pages and offset/length tupples. With the
1366 * partial_page structure, we can map several non-contiguous ranges into
1367 * our ones pages[] map instead of splitting that operation into pieces.
1368 * Could easily be exported as a generic helper for other users, in which
1369 * case one would probably want to add a 'max_nr_pages' parameter as well.
1370 */
1371static int get_iovec_page_array(const struct iovec __user *iov,
1372                unsigned int nr_vecs, struct page **pages,
1373                struct partial_page *partial, int aligned)
1374{
1375    int buffers = 0, error = 0;
1376
1377    while (nr_vecs) {
1378        unsigned long off, npages;
1379        struct iovec entry;
1380        void __user *base;
1381        size_t len;
1382        int i;
1383
1384        error = -EFAULT;
1385        if (copy_from_user(&entry, iov, sizeof(entry)))
1386            break;
1387
1388        base = entry.iov_base;
1389        len = entry.iov_len;
1390
1391        /*
1392         * Sanity check this iovec. 0 read succeeds.
1393         */
1394        error = 0;
1395        if (unlikely(!len))
1396            break;
1397        error = -EFAULT;
1398        if (!access_ok(VERIFY_READ, base, len))
1399            break;
1400
1401        /*
1402         * Get this base offset and number of pages, then map
1403         * in the user pages.
1404         */
1405        off = (unsigned long) base & ~PAGE_MASK;
1406
1407        /*
1408         * If asked for alignment, the offset must be zero and the
1409         * length a multiple of the PAGE_SIZE.
1410         */
1411        error = -EINVAL;
1412        if (aligned && (off || len & ~PAGE_MASK))
1413            break;
1414
1415        npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1416        if (npages > PIPE_BUFFERS - buffers)
1417            npages = PIPE_BUFFERS - buffers;
1418
1419        error = get_user_pages_fast((unsigned long)base, npages,
1420                    0, &pages[buffers]);
1421
1422        if (unlikely(error <= 0))
1423            break;
1424
1425        /*
1426         * Fill this contiguous range into the partial page map.
1427         */
1428        for (i = 0; i < error; i++) {
1429            const int plen = min_t(size_t, len, PAGE_SIZE - off);
1430
1431            partial[buffers].offset = off;
1432            partial[buffers].len = plen;
1433
1434            off = 0;
1435            len -= plen;
1436            buffers++;
1437        }
1438
1439        /*
1440         * We didn't complete this iov, stop here since it probably
1441         * means we have to move some of this into a pipe to
1442         * be able to continue.
1443         */
1444        if (len)
1445            break;
1446
1447        /*
1448         * Don't continue if we mapped fewer pages than we asked for,
1449         * or if we mapped the max number of pages that we have
1450         * room for.
1451         */
1452        if (error < npages || buffers == PIPE_BUFFERS)
1453            break;
1454
1455        nr_vecs--;
1456        iov++;
1457    }
1458
1459    if (buffers)
1460        return buffers;
1461
1462    return error;
1463}
1464
1465static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1466            struct splice_desc *sd)
1467{
1468    char *src;
1469    int ret;
1470
1471    ret = buf->ops->confirm(pipe, buf);
1472    if (unlikely(ret))
1473        return ret;
1474
1475    /*
1476     * See if we can use the atomic maps, by prefaulting in the
1477     * pages and doing an atomic copy
1478     */
1479    if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1480        src = buf->ops->map(pipe, buf, 1);
1481        ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1482                            sd->len);
1483        buf->ops->unmap(pipe, buf, src);
1484        if (!ret) {
1485            ret = sd->len;
1486            goto out;
1487        }
1488    }
1489
1490    /*
1491     * No dice, use slow non-atomic map and copy
1492      */
1493    src = buf->ops->map(pipe, buf, 0);
1494
1495    ret = sd->len;
1496    if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1497        ret = -EFAULT;
1498
1499    buf->ops->unmap(pipe, buf, src);
1500out:
1501    if (ret > 0)
1502        sd->u.userptr += ret;
1503    return ret;
1504}
1505
1506/*
1507 * For lack of a better implementation, implement vmsplice() to userspace
1508 * as a simple copy of the pipes pages to the user iov.
1509 */
1510static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1511                 unsigned long nr_segs, unsigned int flags)
1512{
1513    struct pipe_inode_info *pipe;
1514    struct splice_desc sd;
1515    ssize_t size;
1516    int error;
1517    long ret;
1518
1519    pipe = pipe_info(file->f_path.dentry->d_inode);
1520    if (!pipe)
1521        return -EBADF;
1522
1523    pipe_lock(pipe);
1524
1525    error = ret = 0;
1526    while (nr_segs) {
1527        void __user *base;
1528        size_t len;
1529
1530        /*
1531         * Get user address base and length for this iovec.
1532         */
1533        error = get_user(base, &iov->iov_base);
1534        if (unlikely(error))
1535            break;
1536        error = get_user(len, &iov->iov_len);
1537        if (unlikely(error))
1538            break;
1539
1540        /*
1541         * Sanity check this iovec. 0 read succeeds.
1542         */
1543        if (unlikely(!len))
1544            break;
1545        if (unlikely(!base)) {
1546            error = -EFAULT;
1547            break;
1548        }
1549
1550        if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1551            error = -EFAULT;
1552            break;
1553        }
1554
1555        sd.len = 0;
1556        sd.total_len = len;
1557        sd.flags = flags;
1558        sd.u.userptr = base;
1559        sd.pos = 0;
1560
1561        size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1562        if (size < 0) {
1563            if (!ret)
1564                ret = size;
1565
1566            break;
1567        }
1568
1569        ret += size;
1570
1571        if (size < len)
1572            break;
1573
1574        nr_segs--;
1575        iov++;
1576    }
1577
1578    pipe_unlock(pipe);
1579
1580    if (!ret)
1581        ret = error;
1582
1583    return ret;
1584}
1585
1586/*
1587 * vmsplice splices a user address range into a pipe. It can be thought of
1588 * as splice-from-memory, where the regular splice is splice-from-file (or
1589 * to file). In both cases the output is a pipe, naturally.
1590 */
1591static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1592                 unsigned long nr_segs, unsigned int flags)
1593{
1594    struct pipe_inode_info *pipe;
1595    struct page *pages[PIPE_BUFFERS];
1596    struct partial_page partial[PIPE_BUFFERS];
1597    struct splice_pipe_desc spd = {
1598        .pages = pages,
1599        .partial = partial,
1600        .flags = flags,
1601        .ops = &user_page_pipe_buf_ops,
1602        .spd_release = spd_release_page,
1603    };
1604
1605    pipe = pipe_info(file->f_path.dentry->d_inode);
1606    if (!pipe)
1607        return -EBADF;
1608
1609    spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1610                        flags & SPLICE_F_GIFT);
1611    if (spd.nr_pages <= 0)
1612        return spd.nr_pages;
1613
1614    return splice_to_pipe(pipe, &spd);
1615}
1616
1617/*
1618 * Note that vmsplice only really supports true splicing _from_ user memory
1619 * to a pipe, not the other way around. Splicing from user memory is a simple
1620 * operation that can be supported without any funky alignment restrictions
1621 * or nasty vm tricks. We simply map in the user memory and fill them into
1622 * a pipe. The reverse isn't quite as easy, though. There are two possible
1623 * solutions for that:
1624 *
1625 * - memcpy() the data internally, at which point we might as well just
1626 * do a regular read() on the buffer anyway.
1627 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1628 * has restriction limitations on both ends of the pipe).
1629 *
1630 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1631 *
1632 */
1633SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1634        unsigned long, nr_segs, unsigned int, flags)
1635{
1636    struct file *file;
1637    long error;
1638    int fput;
1639
1640    if (unlikely(nr_segs > UIO_MAXIOV))
1641        return -EINVAL;
1642    else if (unlikely(!nr_segs))
1643        return 0;
1644
1645    error = -EBADF;
1646    file = fget_light(fd, &fput);
1647    if (file) {
1648        if (file->f_mode & FMODE_WRITE)
1649            error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1650        else if (file->f_mode & FMODE_READ)
1651            error = vmsplice_to_user(file, iov, nr_segs, flags);
1652
1653        fput_light(file, fput);
1654    }
1655
1656    return error;
1657}
1658
1659SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1660        int, fd_out, loff_t __user *, off_out,
1661        size_t, len, unsigned int, flags)
1662{
1663    long error;
1664    struct file *in, *out;
1665    int fput_in, fput_out;
1666
1667    if (unlikely(!len))
1668        return 0;
1669
1670    error = -EBADF;
1671    in = fget_light(fd_in, &fput_in);
1672    if (in) {
1673        if (in->f_mode & FMODE_READ) {
1674            out = fget_light(fd_out, &fput_out);
1675            if (out) {
1676                if (out->f_mode & FMODE_WRITE)
1677                    error = do_splice(in, off_in,
1678                              out, off_out,
1679                              len, flags);
1680                fput_light(out, fput_out);
1681            }
1682        }
1683
1684        fput_light(in, fput_in);
1685    }
1686
1687    return error;
1688}
1689
1690/*
1691 * Make sure there's data to read. Wait for input if we can, otherwise
1692 * return an appropriate error.
1693 */
1694static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1695{
1696    int ret;
1697
1698    /*
1699     * Check ->nrbufs without the inode lock first. This function
1700     * is speculative anyways, so missing one is ok.
1701     */
1702    if (pipe->nrbufs)
1703        return 0;
1704
1705    ret = 0;
1706    pipe_lock(pipe);
1707
1708    while (!pipe->nrbufs) {
1709        if (signal_pending(current)) {
1710            ret = -ERESTARTSYS;
1711            break;
1712        }
1713        if (!pipe->writers)
1714            break;
1715        if (!pipe->waiting_writers) {
1716            if (flags & SPLICE_F_NONBLOCK) {
1717                ret = -EAGAIN;
1718                break;
1719            }
1720        }
1721        pipe_wait(pipe);
1722    }
1723
1724    pipe_unlock(pipe);
1725    return ret;
1726}
1727
1728/*
1729 * Make sure there's writeable room. Wait for room if we can, otherwise
1730 * return an appropriate error.
1731 */
1732static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1733{
1734    int ret;
1735
1736    /*
1737     * Check ->nrbufs without the inode lock first. This function
1738     * is speculative anyways, so missing one is ok.
1739     */
1740    if (pipe->nrbufs < PIPE_BUFFERS)
1741        return 0;
1742
1743    ret = 0;
1744    pipe_lock(pipe);
1745
1746    while (pipe->nrbufs >= PIPE_BUFFERS) {
1747        if (!pipe->readers) {
1748            send_sig(SIGPIPE, current, 0);
1749            ret = -EPIPE;
1750            break;
1751        }
1752        if (flags & SPLICE_F_NONBLOCK) {
1753            ret = -EAGAIN;
1754            break;
1755        }
1756        if (signal_pending(current)) {
1757            ret = -ERESTARTSYS;
1758            break;
1759        }
1760        pipe->waiting_writers++;
1761        pipe_wait(pipe);
1762        pipe->waiting_writers--;
1763    }
1764
1765    pipe_unlock(pipe);
1766    return ret;
1767}
1768
1769/*
1770 * Splice contents of ipipe to opipe.
1771 */
1772static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1773                   struct pipe_inode_info *opipe,
1774                   size_t len, unsigned int flags)
1775{
1776    struct pipe_buffer *ibuf, *obuf;
1777    int ret = 0, nbuf;
1778    bool input_wakeup = false;
1779
1780
1781retry:
1782    ret = ipipe_prep(ipipe, flags);
1783    if (ret)
1784        return ret;
1785
1786    ret = opipe_prep(opipe, flags);
1787    if (ret)
1788        return ret;
1789
1790    /*
1791     * Potential ABBA deadlock, work around it by ordering lock
1792     * grabbing by pipe info address. Otherwise two different processes
1793     * could deadlock (one doing tee from A -> B, the other from B -> A).
1794     */
1795    pipe_double_lock(ipipe, opipe);
1796
1797    do {
1798        if (!opipe->readers) {
1799            send_sig(SIGPIPE, current, 0);
1800            if (!ret)
1801                ret = -EPIPE;
1802            break;
1803        }
1804
1805        if (!ipipe->nrbufs && !ipipe->writers)
1806            break;
1807
1808        /*
1809         * Cannot make any progress, because either the input
1810         * pipe is empty or the output pipe is full.
1811         */
1812        if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
1813            /* Already processed some buffers, break */
1814            if (ret)
1815                break;
1816
1817            if (flags & SPLICE_F_NONBLOCK) {
1818                ret = -EAGAIN;
1819                break;
1820            }
1821
1822            /*
1823             * We raced with another reader/writer and haven't
1824             * managed to process any buffers. A zero return
1825             * value means EOF, so retry instead.
1826             */
1827            pipe_unlock(ipipe);
1828            pipe_unlock(opipe);
1829            goto retry;
1830        }
1831
1832        ibuf = ipipe->bufs + ipipe->curbuf;
1833        nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
1834        obuf = opipe->bufs + nbuf;
1835
1836        if (len >= ibuf->len) {
1837            /*
1838             * Simply move the whole buffer from ipipe to opipe
1839             */
1840            *obuf = *ibuf;
1841            ibuf->ops = NULL;
1842            opipe->nrbufs++;
1843            ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
1844            ipipe->nrbufs--;
1845            input_wakeup = true;
1846        } else {
1847            /*
1848             * Get a reference to this pipe buffer,
1849             * so we can copy the contents over.
1850             */
1851            ibuf->ops->get(ipipe, ibuf);
1852            *obuf = *ibuf;
1853
1854            /*
1855             * Don't inherit the gift flag, we need to
1856             * prevent multiple steals of this page.
1857             */
1858            obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1859
1860            obuf->len = len;
1861            opipe->nrbufs++;
1862            ibuf->offset += obuf->len;
1863            ibuf->len -= obuf->len;
1864        }
1865        ret += obuf->len;
1866        len -= obuf->len;
1867    } while (len);
1868
1869    pipe_unlock(ipipe);
1870    pipe_unlock(opipe);
1871
1872    /*
1873     * If we put data in the output pipe, wakeup any potential readers.
1874     */
1875    if (ret > 0) {
1876        smp_mb();
1877        if (waitqueue_active(&opipe->wait))
1878            wake_up_interruptible(&opipe->wait);
1879        kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1880    }
1881    if (input_wakeup)
1882        wakeup_pipe_writers(ipipe);
1883
1884    return ret;
1885}
1886
1887/*
1888 * Link contents of ipipe to opipe.
1889 */
1890static int link_pipe(struct pipe_inode_info *ipipe,
1891             struct pipe_inode_info *opipe,
1892             size_t len, unsigned int flags)
1893{
1894    struct pipe_buffer *ibuf, *obuf;
1895    int ret = 0, i = 0, nbuf;
1896
1897    /*
1898     * Potential ABBA deadlock, work around it by ordering lock
1899     * grabbing by pipe info address. Otherwise two different processes
1900     * could deadlock (one doing tee from A -> B, the other from B -> A).
1901     */
1902    pipe_double_lock(ipipe, opipe);
1903
1904    do {
1905        if (!opipe->readers) {
1906            send_sig(SIGPIPE, current, 0);
1907            if (!ret)
1908                ret = -EPIPE;
1909            break;
1910        }
1911
1912        /*
1913         * If we have iterated all input buffers or ran out of
1914         * output room, break.
1915         */
1916        if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1917            break;
1918
1919        ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1920        nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1921
1922        /*
1923         * Get a reference to this pipe buffer,
1924         * so we can copy the contents over.
1925         */
1926        ibuf->ops->get(ipipe, ibuf);
1927
1928        obuf = opipe->bufs + nbuf;
1929        *obuf = *ibuf;
1930
1931        /*
1932         * Don't inherit the gift flag, we need to
1933         * prevent multiple steals of this page.
1934         */
1935        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1936
1937        if (obuf->len > len)
1938            obuf->len = len;
1939
1940        opipe->nrbufs++;
1941        ret += obuf->len;
1942        len -= obuf->len;
1943        i++;
1944    } while (len);
1945
1946    /*
1947     * return EAGAIN if we have the potential of some data in the
1948     * future, otherwise just return 0
1949     */
1950    if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1951        ret = -EAGAIN;
1952
1953    pipe_unlock(ipipe);
1954    pipe_unlock(opipe);
1955
1956    /*
1957     * If we put data in the output pipe, wakeup any potential readers.
1958     */
1959    if (ret > 0) {
1960        smp_mb();
1961        if (waitqueue_active(&opipe->wait))
1962            wake_up_interruptible(&opipe->wait);
1963        kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1964    }
1965
1966    return ret;
1967}
1968
1969/*
1970 * This is a tee(1) implementation that works on pipes. It doesn't copy
1971 * any data, it simply references the 'in' pages on the 'out' pipe.
1972 * The 'flags' used are the SPLICE_F_* variants, currently the only
1973 * applicable one is SPLICE_F_NONBLOCK.
1974 */
1975static long do_tee(struct file *in, struct file *out, size_t len,
1976           unsigned int flags)
1977{
1978    struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1979    struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1980    int ret = -EINVAL;
1981
1982    /*
1983     * Duplicate the contents of ipipe to opipe without actually
1984     * copying the data.
1985     */
1986    if (ipipe && opipe && ipipe != opipe) {
1987        /*
1988         * Keep going, unless we encounter an error. The ipipe/opipe
1989         * ordering doesn't really matter.
1990         */
1991        ret = ipipe_prep(ipipe, flags);
1992        if (!ret) {
1993            ret = opipe_prep(opipe, flags);
1994            if (!ret)
1995                ret = link_pipe(ipipe, opipe, len, flags);
1996        }
1997    }
1998
1999    return ret;
2000}
2001
2002SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
2003{
2004    struct file *in;
2005    int error, fput_in;
2006
2007    if (unlikely(!len))
2008        return 0;
2009
2010    error = -EBADF;
2011    in = fget_light(fdin, &fput_in);
2012    if (in) {
2013        if (in->f_mode & FMODE_READ) {
2014            int fput_out;
2015            struct file *out = fget_light(fdout, &fput_out);
2016
2017            if (out) {
2018                if (out->f_mode & FMODE_WRITE)
2019                    error = do_tee(in, out, len, flags);
2020                fput_light(out, fput_out);
2021            }
2022        }
2023         fput_light(in, fput_in);
2024     }
2025
2026    return error;
2027}
2028

Archive Download this file



interactive