kernel/futex.c

1/*
2 * Fast Userspace Mutexes (which I call "Futexes!").
3 * (C) Rusty Russell, IBM 2002
4 *
5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7 *
8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier
10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
27 * enough at me, Linus for the original (flawed) idea, Matthew
28 * Kirkwood for proof-of-concept implementation.
29 *
30 * "The futexes are also cursed."
31 * "But they come in a choice of three flavours!"
32 *
33 * This program is free software; you can redistribute it and/or modify
34 * it under the terms of the GNU General Public License as published by
35 * the Free Software Foundation; either version 2 of the License, or
36 * (at your option) any later version.
37 *
38 * This program is distributed in the hope that it will be useful,
39 * but WITHOUT ANY WARRANTY; without even the implied warranty of
40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
41 * GNU General Public License for more details.
42 *
43 * You should have received a copy of the GNU General Public License
44 * along with this program; if not, write to the Free Software
45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46 */
47#include <linux/slab.h>
48#include <linux/poll.h>
49#include <linux/fs.h>
50#include <linux/file.h>
51#include <linux/jhash.h>
52#include <linux/init.h>
53#include <linux/futex.h>
54#include <linux/mount.h>
55#include <linux/pagemap.h>
56#include <linux/syscalls.h>
57#include <linux/signal.h>
58#include <linux/export.h>
59#include <linux/magic.h>
60#include <linux/pid.h>
61#include <linux/nsproxy.h>
62#include <linux/ptrace.h>
63
64#include <asm/futex.h>
65
66#include "rtmutex_common.h"
67
68int __read_mostly futex_cmpxchg_enabled;
69
70#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
71
72/*
73 * Futex flags used to encode options to functions and preserve them across
74 * restarts.
75 */
76#define FLAGS_SHARED 0x01
77#define FLAGS_CLOCKRT 0x02
78#define FLAGS_HAS_TIMEOUT 0x04
79
80/*
81 * Priority Inheritance state:
82 */
83struct futex_pi_state {
84    /*
85     * list of 'owned' pi_state instances - these have to be
86     * cleaned up in do_exit() if the task exits prematurely:
87     */
88    struct list_head list;
89
90    /*
91     * The PI object:
92     */
93    struct rt_mutex pi_mutex;
94
95    struct task_struct *owner;
96    atomic_t refcount;
97
98    union futex_key key;
99};
100
101/**
102 * struct futex_q - The hashed futex queue entry, one per waiting task
103 * @list: priority-sorted list of tasks waiting on this futex
104 * @task: the task waiting on the futex
105 * @lock_ptr: the hash bucket lock
106 * @key: the key the futex is hashed on
107 * @pi_state: optional priority inheritance state
108 * @rt_waiter: rt_waiter storage for use with requeue_pi
109 * @requeue_pi_key: the requeue_pi target futex key
110 * @bitset: bitset for the optional bitmasked wakeup
111 *
112 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
113 * we can wake only the relevant ones (hashed queues may be shared).
114 *
115 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
116 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
117 * The order of wakeup is always to make the first condition true, then
118 * the second.
119 *
120 * PI futexes are typically woken before they are removed from the hash list via
121 * the rt_mutex code. See unqueue_me_pi().
122 */
123struct futex_q {
124    struct plist_node list;
125
126    struct task_struct *task;
127    spinlock_t *lock_ptr;
128    union futex_key key;
129    struct futex_pi_state *pi_state;
130    struct rt_mutex_waiter *rt_waiter;
131    union futex_key *requeue_pi_key;
132    u32 bitset;
133};
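/*
 * Illustrative sketch of the "woken" test described above, written as a
 * hypothetical helper; the real checks are open-coded in unqueue_me() and
 * the wait paths below:
 *
 *    static inline bool futex_q_is_woken(struct futex_q *q)
 *    {
 *        return plist_node_empty(&q->list) || q->lock_ptr == NULL;
 *    }
 */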
134
135static const struct futex_q futex_q_init = {
136    /* list gets initialized in queue_me() */
137    .key = FUTEX_KEY_INIT,
138    .bitset = FUTEX_BITSET_MATCH_ANY
139};
140
141/*
142 * Hash buckets are shared by all the futex_keys that hash to the same
143 * location. Each key may have multiple futex_q structures, one for each task
144 * waiting on a futex.
145 */
146struct futex_hash_bucket {
147    spinlock_t lock;
148    struct plist_head chain;
149};
150
151static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
152
153/*
154 * We hash on the keys returned from get_futex_key (see below).
155 */
156static struct futex_hash_bucket *hash_futex(union futex_key *key)
157{
158    u32 hash = jhash2((u32*)&key->both.word,
159              (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
160              key->both.offset);
161    return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
162}
163
164/*
165 * Return 1 if two futex_keys are equal, 0 otherwise.
166 */
167static inline int match_futex(union futex_key *key1, union futex_key *key2)
168{
169    return (key1 && key2
170        && key1->both.word == key2->both.word
171        && key1->both.ptr == key2->both.ptr
172        && key1->both.offset == key2->both.offset);
173}
174
175/*
176 * Take a reference to the resource addressed by a key.
177 * Can be called while holding spinlocks.
178 *
179 */
180static void get_futex_key_refs(union futex_key *key)
181{
182    if (!key->both.ptr)
183        return;
184
185    switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
186    case FUT_OFF_INODE:
187        ihold(key->shared.inode);
188        break;
189    case FUT_OFF_MMSHARED:
190        atomic_inc(&key->private.mm->mm_count);
191        break;
192    }
193}
194
195/*
196 * Drop a reference to the resource addressed by a key.
197 * The hash bucket spinlock must not be held.
198 */
199static void drop_futex_key_refs(union futex_key *key)
200{
201    if (!key->both.ptr) {
202        /* If we're here then we tried to put a key we failed to get */
203        WARN_ON_ONCE(1);
204        return;
205    }
206
207    switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
208    case FUT_OFF_INODE:
209        iput(key->shared.inode);
210        break;
211    case FUT_OFF_MMSHARED:
212        mmdrop(key->private.mm);
213        break;
214    }
215}
216
217/**
218 * get_futex_key() - Get parameters which are the keys for a futex
219 * @uaddr: virtual address of the futex
220 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
221 * @key: address where result is stored.
222 * @rw: mapping needs to be read/write (values: VERIFY_READ,
223 * VERIFY_WRITE)
224 *
225 * Returns a negative error code or 0
226 * The key words are stored in *key on success.
227 *
228 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
229 * offset_within_page). For private mappings, it's (uaddr, current->mm).
230 * We can usually work out the index without swapping in the page.
231 *
232 * lock_page() might sleep, the caller should not hold a spinlock.
233 */
234static int
235get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
236{
237    unsigned long address = (unsigned long)uaddr;
238    struct mm_struct *mm = current->mm;
239    struct page *page, *page_head;
240    int err, ro = 0;
241
242    /*
243     * The futex address must be "naturally" aligned.
244     */
245    key->both.offset = address % PAGE_SIZE;
246    if (unlikely((address % sizeof(u32)) != 0))
247        return -EINVAL;
248    address -= key->both.offset;
249
250    /*
251     * PROCESS_PRIVATE futexes are fast.
252     * As the mm cannot disappear under us and the 'key' only needs the
253     * virtual address, we don't even have to find the underlying vma.
254     * Note: We do have to check that 'uaddr' is a valid user address,
255     * but access_ok() should be faster than find_vma().
256     */
257    if (!fshared) {
258        if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
259            return -EFAULT;
260        key->private.mm = mm;
261        key->private.address = address;
262        get_futex_key_refs(key);
263        return 0;
264    }
265
266again:
267    err = get_user_pages_fast(address, 1, 1, &page);
268    /*
269     * If write access is not required (e.g. FUTEX_WAIT), try
270     * to get read-only access.
271     */
272    if (err == -EFAULT && rw == VERIFY_READ) {
273        err = get_user_pages_fast(address, 1, 0, &page);
274        ro = 1;
275    }
276    if (err < 0)
277        return err;
278    else
279        err = 0;
280
281#ifdef CONFIG_TRANSPARENT_HUGEPAGE
282    page_head = page;
283    if (unlikely(PageTail(page))) {
284        put_page(page);
285        /* serialize against __split_huge_page_splitting() */
286        local_irq_disable();
287        if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
288            page_head = compound_head(page);
289            /*
290             * page_head is valid pointer but we must pin
291             * it before taking the PG_lock and/or
292             * PG_compound_lock. The moment we re-enable
293             * irqs __split_huge_page_splitting() can
294             * return and the head page can be freed from
295             * under us. We can't take the PG_lock and/or
296             * PG_compound_lock on a page that could be
297             * freed from under us.
298             */
299            if (page != page_head) {
300                get_page(page_head);
301                put_page(page);
302            }
303            local_irq_enable();
304        } else {
305            local_irq_enable();
306            goto again;
307        }
308    }
309#else
310    page_head = compound_head(page);
311    if (page != page_head) {
312        get_page(page_head);
313        put_page(page);
314    }
315#endif
316
317    lock_page(page_head);
318
319    /*
320     * If page_head->mapping is NULL, then it cannot be a PageAnon
321     * page; but it might be the ZERO_PAGE or in the gate area or
322     * in a special mapping (all cases which we are happy to fail);
323     * or it may have been a good file page when get_user_pages_fast
324     * found it, but truncated or holepunched or subjected to
325     * invalidate_complete_page2 before we got the page lock (also
326     * cases which we are happy to fail). And we hold a reference,
327     * so refcount care in invalidate_complete_page's remove_mapping
328     * prevents drop_caches from setting mapping to NULL beneath us.
329     *
330     * The case we do have to guard against is when memory pressure made
331     * shmem_writepage move it from filecache to swapcache beneath us:
332     * an unlikely race, but we do need to retry for page_head->mapping.
333     */
334    if (!page_head->mapping) {
335        int shmem_swizzled = PageSwapCache(page_head);
336        unlock_page(page_head);
337        put_page(page_head);
338        if (shmem_swizzled)
339            goto again;
340        return -EFAULT;
341    }
342
343    /*
344     * Private mappings are handled in a simple way.
345     *
346     * NOTE: When userspace waits on a MAP_SHARED mapping, even if
347     * it's a read-only handle, it's expected that futexes attach to
348     * the object, not the particular process.
349     */
350    if (PageAnon(page_head)) {
351        /*
352         * A RO anonymous page will never change and thus doesn't make
353         * sense for futex operations.
354         */
355        if (ro) {
356            err = -EFAULT;
357            goto out;
358        }
359
360        key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
361        key->private.mm = mm;
362        key->private.address = address;
363    } else {
364        key->both.offset |= FUT_OFF_INODE; /* inode-based key */
365        key->shared.inode = page_head->mapping->host;
366        key->shared.pgoff = page_head->index;
367    }
368
369    get_futex_key_refs(key);
370
371out:
372    unlock_page(page_head);
373    put_page(page_head);
374    return err;
375}
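/*
 * For reference, the private fast path above is what userspace opts into by
 * passing FUTEX_PRIVATE_FLAG. A minimal, hypothetical wrapper (not part of
 * this file) for a process-private wait could look like:
 *
 *    #include <linux/futex.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    static int futex_wait_private(int *uaddr, int expected)
 *    {
 *        return syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE,
 *                       expected, NULL, NULL, 0);
 *    }
 *
 * With the private flag set, the key is just (mm, address) and the page and
 * its inode are never looked up.
 */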
376
377static inline void put_futex_key(union futex_key *key)
378{
379    drop_futex_key_refs(key);
380}
381
382/**
383 * fault_in_user_writeable() - Fault in user address and verify RW access
384 * @uaddr: pointer to faulting user space address
385 *
386 * Slow path to fix up the fault we just took in the atomic write
387 * access to @uaddr.
388 *
389 * We have no generic implementation of a non-destructive write to the
390 * user address. We know that we faulted in the atomic pagefault
391 * disabled section so we might as well avoid the #PF overhead by
392 * calling fixup_user_fault() right away.
393 */
394static int fault_in_user_writeable(u32 __user *uaddr)
395{
396    struct mm_struct *mm = current->mm;
397    int ret;
398
399    down_read(&mm->mmap_sem);
400    ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
401                   FAULT_FLAG_WRITE);
402    up_read(&mm->mmap_sem);
403
404    return ret < 0 ? ret : 0;
405}
406
407/**
408 * futex_top_waiter() - Return the highest priority waiter on a futex
409 * @hb: the hash bucket the futex_q's reside in
410 * @key: the futex key (to distinguish it from futex_q's of other futexes)
411 *
412 * Must be called with the hb lock held.
413 */
414static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
415                    union futex_key *key)
416{
417    struct futex_q *this;
418
419    plist_for_each_entry(this, &hb->chain, list) {
420        if (match_futex(&this->key, key))
421            return this;
422    }
423    return NULL;
424}
425
426static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
427                      u32 uval, u32 newval)
428{
429    int ret;
430
431    pagefault_disable();
432    ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
433    pagefault_enable();
434
435    return ret;
436}
437
438static int get_futex_value_locked(u32 *dest, u32 __user *from)
439{
440    int ret;
441
442    pagefault_disable();
443    ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
444    pagefault_enable();
445
446    return ret ? -EFAULT : 0;
447}
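/*
 * Both helpers above run with pagefaults disabled because their callers
 * hold a hash bucket spinlock. On a fault the caller is expected to drop
 * the lock, fault the page in from sleepable context and retry, roughly
 * following the pattern used in futex_requeue() below (sketch):
 *
 *    ret = get_futex_value_locked(&curval, uaddr);
 *    if (ret) {
 *        spin_unlock(&hb->lock);
 *        ret = get_user(curval, uaddr);
 *        if (!ret)
 *            goto retry;
 *    }
 */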
448
449
450/*
451 * PI code:
452 */
453static int refill_pi_state_cache(void)
454{
455    struct futex_pi_state *pi_state;
456
457    if (likely(current->pi_state_cache))
458        return 0;
459
460    pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
461
462    if (!pi_state)
463        return -ENOMEM;
464
465    INIT_LIST_HEAD(&pi_state->list);
466    /* pi_mutex gets initialized later */
467    pi_state->owner = NULL;
468    atomic_set(&pi_state->refcount, 1);
469    pi_state->key = FUTEX_KEY_INIT;
470
471    current->pi_state_cache = pi_state;
472
473    return 0;
474}
475
476static struct futex_pi_state * alloc_pi_state(void)
477{
478    struct futex_pi_state *pi_state = current->pi_state_cache;
479
480    WARN_ON(!pi_state);
481    current->pi_state_cache = NULL;
482
483    return pi_state;
484}
485
486static void free_pi_state(struct futex_pi_state *pi_state)
487{
488    if (!atomic_dec_and_test(&pi_state->refcount))
489        return;
490
491    /*
492     * If pi_state->owner is NULL, the owner is most probably dying
493     * and has cleaned up the pi_state already
494     */
495    if (pi_state->owner) {
496        raw_spin_lock_irq(&pi_state->owner->pi_lock);
497        list_del_init(&pi_state->list);
498        raw_spin_unlock_irq(&pi_state->owner->pi_lock);
499
500        rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
501    }
502
503    if (current->pi_state_cache)
504        kfree(pi_state);
505    else {
506        /*
507         * pi_state->list is already empty.
508         * clear pi_state->owner.
509         * refcount is at 0 - put it back to 1.
510         */
511        pi_state->owner = NULL;
512        atomic_set(&pi_state->refcount, 1);
513        current->pi_state_cache = pi_state;
514    }
515}
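/*
 * Typical pi_state cache life cycle, as used by the PI and requeue-PI call
 * paths further down (sketch):
 *
 *    refill_pi_state_cache();      - may sleep, done before any hb lock
 *    ...
 *    pi_state = alloc_pi_state();  - consumes the cache filled above
 *    ...
 *    free_pi_state(pi_state);      - last reference is recycled into
 *                                    current->pi_state_cache or freed
 */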
516
517/*
518 * Look up the task based on what TID userspace gave us.
519 * We don't trust it.
520 */
521static struct task_struct * futex_find_get_task(pid_t pid)
522{
523    struct task_struct *p;
524
525    rcu_read_lock();
526    p = find_task_by_vpid(pid);
527    if (p)
528        get_task_struct(p);
529
530    rcu_read_unlock();
531
532    return p;
533}
534
535/*
536 * This task is holding PI mutexes at exit time => bad.
537 * Kernel cleans up PI-state, but userspace is likely hosed.
538 * (Robust-futex cleanup is separate and might save the day for userspace.)
539 */
540void exit_pi_state_list(struct task_struct *curr)
541{
542    struct list_head *next, *head = &curr->pi_state_list;
543    struct futex_pi_state *pi_state;
544    struct futex_hash_bucket *hb;
545    union futex_key key = FUTEX_KEY_INIT;
546
547    if (!futex_cmpxchg_enabled)
548        return;
549    /*
550     * We are a ZOMBIE and nobody can enqueue itself on
551     * pi_state_list anymore, but we have to be careful
552     * about waiters unqueueing themselves:
553     */
554    raw_spin_lock_irq(&curr->pi_lock);
555    while (!list_empty(head)) {
556
557        next = head->next;
558        pi_state = list_entry(next, struct futex_pi_state, list);
559        key = pi_state->key;
560        hb = hash_futex(&key);
561        raw_spin_unlock_irq(&curr->pi_lock);
562
563        spin_lock(&hb->lock);
564
565        raw_spin_lock_irq(&curr->pi_lock);
566        /*
567         * We dropped the pi-lock, so re-check whether this
568         * task still owns the PI-state:
569         */
570        if (head->next != next) {
571            spin_unlock(&hb->lock);
572            continue;
573        }
574
575        WARN_ON(pi_state->owner != curr);
576        WARN_ON(list_empty(&pi_state->list));
577        list_del_init(&pi_state->list);
578        pi_state->owner = NULL;
579        raw_spin_unlock_irq(&curr->pi_lock);
580
581        rt_mutex_unlock(&pi_state->pi_mutex);
582
583        spin_unlock(&hb->lock);
584
585        raw_spin_lock_irq(&curr->pi_lock);
586    }
587    raw_spin_unlock_irq(&curr->pi_lock);
588}
589
590static int
591lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
592        union futex_key *key, struct futex_pi_state **ps)
593{
594    struct futex_pi_state *pi_state = NULL;
595    struct futex_q *this, *next;
596    struct plist_head *head;
597    struct task_struct *p;
598    pid_t pid = uval & FUTEX_TID_MASK;
599
600    head = &hb->chain;
601
602    plist_for_each_entry_safe(this, next, head, list) {
603        if (match_futex(&this->key, key)) {
604            /*
605             * Another waiter already exists - bump up
606             * the refcount and return its pi_state:
607             */
608            pi_state = this->pi_state;
609            /*
610             * Userspace might have messed up non-PI and PI futexes
611             */
612            if (unlikely(!pi_state))
613                return -EINVAL;
614
615            WARN_ON(!atomic_read(&pi_state->refcount));
616
617            /*
618             * When pi_state->owner is NULL then the owner died
619             * and another waiter is in flight. pi_state->owner
620             * is fixed up by the task which acquires
621             * pi_state->pi_mutex.
622             *
623             * We do not check for pid == 0 which can happen when
624             * the owner died and robust_list_exit() cleared the
625             * TID.
626             */
627            if (pid && pi_state->owner) {
628                /*
629                 * Bail out if user space manipulated the
630                 * futex value.
631                 */
632                if (pid != task_pid_vnr(pi_state->owner))
633                    return -EINVAL;
634            }
635
636            atomic_inc(&pi_state->refcount);
637            *ps = pi_state;
638
639            return 0;
640        }
641    }
642
643    /*
644     * We are the first waiter - try to look up the real owner and attach
645     * the new pi_state to it, but bail out when TID = 0
646     */
647    if (!pid)
648        return -ESRCH;
649    p = futex_find_get_task(pid);
650    if (!p)
651        return -ESRCH;
652
653    /*
654     * We need to look at the task state flags to figure out
655     * whether the task is exiting. To protect against the do_exit
656     * change of the task flags, we do this protected by
657     * p->pi_lock:
658     */
659    raw_spin_lock_irq(&p->pi_lock);
660    if (unlikely(p->flags & PF_EXITING)) {
661        /*
662         * The task is on the way out. When PF_EXITPIDONE is
663         * set, we know that the task has finished the
664         * cleanup:
665         */
666        int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
667
668        raw_spin_unlock_irq(&p->pi_lock);
669        put_task_struct(p);
670        return ret;
671    }
672
673    pi_state = alloc_pi_state();
674
675    /*
676     * Initialize the pi_mutex in locked state and make 'p'
677     * the owner of it:
678     */
679    rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
680
681    /* Store the key for possible exit cleanups: */
682    pi_state->key = *key;
683
684    WARN_ON(!list_empty(&pi_state->list));
685    list_add(&pi_state->list, &p->pi_state_list);
686    pi_state->owner = p;
687    raw_spin_unlock_irq(&p->pi_lock);
688
689    put_task_struct(p);
690
691    *ps = pi_state;
692
693    return 0;
694}
695
696/**
697 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
698 * @uaddr: the pi futex user address
699 * @hb: the pi futex hash bucket
700 * @key: the futex key associated with uaddr and hb
701 * @ps: the pi_state pointer where we store the result of the
702 * lookup
703 * @task: the task to perform the atomic lock work for. This will
704 * be "current" except in the case of requeue pi.
705 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
706 *
707 * Returns:
708 * 0 - ready to wait
709 * 1 - acquired the lock
710 * <0 - error
711 *
712 * The hb->lock and futex_key refs shall be held by the caller.
713 */
714static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
715                union futex_key *key,
716                struct futex_pi_state **ps,
717                struct task_struct *task, int set_waiters)
718{
719    int lock_taken, ret, ownerdied = 0;
720    u32 uval, newval, curval, vpid = task_pid_vnr(task);
721
722retry:
723    ret = lock_taken = 0;
724
725    /*
726     * To avoid races, we attempt to take the lock here again
727     * (by doing a 0 -> TID atomic cmpxchg), while holding all
728     * the locks. It will most likely not succeed.
729     */
730    newval = vpid;
731    if (set_waiters)
732        newval |= FUTEX_WAITERS;
733
734    if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
735        return -EFAULT;
736
737    /*
738     * Detect deadlocks.
739     */
740    if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
741        return -EDEADLK;
742
743    /*
744     * Surprise - we got the lock. Just return to userspace:
745     */
746    if (unlikely(!curval))
747        return 1;
748
749    uval = curval;
750
751    /*
752     * Set the FUTEX_WAITERS flag, so the owner will know it has someone
753     * to wake at the next unlock.
754     */
755    newval = curval | FUTEX_WAITERS;
756
757    /*
758     * There are two cases where a futex can end up with no owner: the
759     * OWNER_DIED bit was seen on a previous iteration (ownerdied set),
760     * or the owner TID is 0 because robust cleanup already cleared it.
761     * In either case we take over the futex unconditionally.
762     *
763     * This is safe as we are protected by the hash bucket lock!
764     */
765    if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
766        /* Keep the OWNER_DIED bit */
767        newval = (curval & ~FUTEX_TID_MASK) | vpid;
768        ownerdied = 0;
769        lock_taken = 1;
770    }
771
772    if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
773        return -EFAULT;
774    if (unlikely(curval != uval))
775        goto retry;
776
777    /*
778     * We took the lock due to an owner-died takeover.
779     */
780    if (unlikely(lock_taken))
781        return 1;
782
783    /*
784     * We dont have the lock. Look up the PI state (or create it if
785     * we are the first waiter):
786     */
787    ret = lookup_pi_state(uval, hb, key, ps);
788
789    if (unlikely(ret)) {
790        switch (ret) {
791        case -ESRCH:
792            /*
793             * No owner found for this futex. Check if the
794             * OWNER_DIED bit is set to figure out whether
795             * this is a robust futex or not.
796             */
797            if (get_futex_value_locked(&curval, uaddr))
798                return -EFAULT;
799
800            /*
801             * We simply start over in case of a robust
802             * futex. The code above will take the futex
803             * and return happy.
804             */
805            if (curval & FUTEX_OWNER_DIED) {
806                ownerdied = 1;
807                goto retry;
808            }
809        default:
810            break;
811        }
812    }
813
814    return ret;
815}
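/*
 * For reference, the userspace fast path that this function backs up never
 * enters the kernel in the uncontended case; only a failed 0 -> TID cmpxchg
 * does. Hypothetical sketch (not part of this file), assuming uaddr points
 * at the PI futex word:
 *
 *    pid_t tid = syscall(SYS_gettid);
 *
 *    if (__sync_val_compare_and_swap(uaddr, 0, tid) != 0)
 *        syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 */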
816
817/**
818 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
819 * @q: The futex_q to unqueue
820 *
821 * The q->lock_ptr must not be NULL and must be held by the caller.
822 */
823static void __unqueue_futex(struct futex_q *q)
824{
825    struct futex_hash_bucket *hb;
826
827    if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
828        || WARN_ON(plist_node_empty(&q->list)))
829        return;
830
831    hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
832    plist_del(&q->list, &hb->chain);
833}
834
835/*
836 * The hash bucket lock must be held when this is called.
837 * Afterwards, the futex_q must not be accessed.
838 */
839static void wake_futex(struct futex_q *q)
840{
841    struct task_struct *p = q->task;
842
843    /*
844     * We set q->lock_ptr = NULL _before_ we wake up the task. If
845     * a non-futex wake up happens on another CPU then the task
846     * might exit and p would dereference a non-existing task
847     * struct. Prevent this by holding a reference on p across the
848     * wake up.
849     */
850    get_task_struct(p);
851
852    __unqueue_futex(q);
853    /*
854     * The waiting task can free the futex_q as soon as
855     * q->lock_ptr = NULL is written, without taking any locks. A
856     * memory barrier is required here to prevent the following
857     * store to lock_ptr from getting ahead of the plist_del.
858     */
859    smp_wmb();
860    q->lock_ptr = NULL;
861
862    wake_up_state(p, TASK_NORMAL);
863    put_task_struct(p);
864}
865
866static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
867{
868    struct task_struct *new_owner;
869    struct futex_pi_state *pi_state = this->pi_state;
870    u32 uninitialized_var(curval), newval;
871
872    if (!pi_state)
873        return -EINVAL;
874
875    /*
876     * If current does not own the pi_state then the futex is
877     * inconsistent and user space fiddled with the futex value.
878     */
879    if (pi_state->owner != current)
880        return -EINVAL;
881
882    raw_spin_lock(&pi_state->pi_mutex.wait_lock);
883    new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
884
885    /*
886     * It is possible that the next waiter (the one that brought
887     * this owner to the kernel) timed out and is no longer
888     * waiting on the lock.
889     */
890    if (!new_owner)
891        new_owner = this->task;
892
893    /*
894     * We pass it to the next owner. (The WAITERS bit is always
895     * kept enabled while there is PI state around. We must also
896     * preserve the owner died bit.)
897     */
898    if (!(uval & FUTEX_OWNER_DIED)) {
899        int ret = 0;
900
901        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
902
903        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
904            ret = -EFAULT;
905        else if (curval != uval)
906            ret = -EINVAL;
907        if (ret) {
908            raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
909            return ret;
910        }
911    }
912
913    raw_spin_lock_irq(&pi_state->owner->pi_lock);
914    WARN_ON(list_empty(&pi_state->list));
915    list_del_init(&pi_state->list);
916    raw_spin_unlock_irq(&pi_state->owner->pi_lock);
917
918    raw_spin_lock_irq(&new_owner->pi_lock);
919    WARN_ON(!list_empty(&pi_state->list));
920    list_add(&pi_state->list, &new_owner->pi_state_list);
921    pi_state->owner = new_owner;
922    raw_spin_unlock_irq(&new_owner->pi_lock);
923
924    raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
925    rt_mutex_unlock(&pi_state->pi_mutex);
926
927    return 0;
928}
929
930static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
931{
932    u32 uninitialized_var(oldval);
933
934    /*
935     * There is no waiter, so we unlock the futex. The owner died
936     * bit does not need to be preserved here. We are the owner:
937     */
938    if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
939        return -EFAULT;
940    if (oldval != uval)
941        return -EAGAIN;
942
943    return 0;
944}
945
946/*
947 * Express the locking dependencies for lockdep:
948 */
949static inline void
950double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
951{
952    if (hb1 <= hb2) {
953        spin_lock(&hb1->lock);
954        if (hb1 < hb2)
955            spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
956    } else { /* hb1 > hb2 */
957        spin_lock(&hb2->lock);
958        spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
959    }
960}
961
962static inline void
963double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
964{
965    spin_unlock(&hb1->lock);
966    if (hb1 != hb2)
967        spin_unlock(&hb2->lock);
968}
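/*
 * The address ordering in double_lock_hb() is what prevents an ABBA
 * deadlock when two callers operate on the same pair of buckets in
 * opposite argument order, e.g.:
 *
 *    CPU 0                          CPU 1
 *    double_lock_hb(hb1, hb2);      double_lock_hb(hb2, hb1);
 *
 * Both CPUs lock the lower-addressed bucket first, so one of them simply
 * waits instead of deadlocking.
 */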
969
970/*
971 * Wake up waiters matching bitset queued on this futex (uaddr).
972 */
973static int
974futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
975{
976    struct futex_hash_bucket *hb;
977    struct futex_q *this, *next;
978    struct plist_head *head;
979    union futex_key key = FUTEX_KEY_INIT;
980    int ret;
981
982    if (!bitset)
983        return -EINVAL;
984
985    ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
986    if (unlikely(ret != 0))
987        goto out;
988
989    hb = hash_futex(&key);
990    spin_lock(&hb->lock);
991    head = &hb->chain;
992
993    plist_for_each_entry_safe(this, next, head, list) {
994        if (match_futex (&this->key, &key)) {
995            if (this->pi_state || this->rt_waiter) {
996                ret = -EINVAL;
997                break;
998            }
999
1000            /* Check if one of the bits is set in both bitsets */
1001            if (!(this->bitset & bitset))
1002                continue;
1003
1004            wake_futex(this);
1005            if (++ret >= nr_wake)
1006                break;
1007        }
1008    }
1009
1010    spin_unlock(&hb->lock);
1011    put_futex_key(&key);
1012out:
1013    return ret;
1014}
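/*
 * The bitset intersection test above is what makes the BITSET variants
 * selective: a waiter tags itself with a mask at wait time and is only
 * woken if the waker's mask overlaps it. Hypothetical userspace sketch,
 * where only "group 0x1" waiters are woken:
 *
 *    syscall(SYS_futex, uaddr, FUTEX_WAIT_BITSET, expected,
 *            NULL, NULL, 0x1);
 *
 *    syscall(SYS_futex, uaddr, FUTEX_WAKE_BITSET, INT_MAX,
 *            NULL, NULL, 0x1);
 *
 * Plain FUTEX_WAIT/FUTEX_WAKE use FUTEX_BITSET_MATCH_ANY (see futex_q_init
 * above), so their bitsets always intersect.
 */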
1015
1016/*
1017 * Apply an atomic operation to the futex value at uaddr2, wake waiters
1018 * on uaddr1 and, if the operation's condition holds, also on uaddr2:
1019 */
1020static int
1021futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1022          int nr_wake, int nr_wake2, int op)
1023{
1024    union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1025    struct futex_hash_bucket *hb1, *hb2;
1026    struct plist_head *head;
1027    struct futex_q *this, *next;
1028    int ret, op_ret;
1029
1030retry:
1031    ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1032    if (unlikely(ret != 0))
1033        goto out;
1034    ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1035    if (unlikely(ret != 0))
1036        goto out_put_key1;
1037
1038    hb1 = hash_futex(&key1);
1039    hb2 = hash_futex(&key2);
1040
1041retry_private:
1042    double_lock_hb(hb1, hb2);
1043    op_ret = futex_atomic_op_inuser(op, uaddr2);
1044    if (unlikely(op_ret < 0)) {
1045
1046        double_unlock_hb(hb1, hb2);
1047
1048#ifndef CONFIG_MMU
1049        /*
1050         * we don't get EFAULT from MMU faults if we don't have an MMU,
1051         * but we might get them from range checking
1052         */
1053        ret = op_ret;
1054        goto out_put_keys;
1055#endif
1056
1057        if (unlikely(op_ret != -EFAULT)) {
1058            ret = op_ret;
1059            goto out_put_keys;
1060        }
1061
1062        ret = fault_in_user_writeable(uaddr2);
1063        if (ret)
1064            goto out_put_keys;
1065
1066        if (!(flags & FLAGS_SHARED))
1067            goto retry_private;
1068
1069        put_futex_key(&key2);
1070        put_futex_key(&key1);
1071        goto retry;
1072    }
1073
1074    head = &hb1->chain;
1075
1076    plist_for_each_entry_safe(this, next, head, list) {
1077        if (match_futex (&this->key, &key1)) {
1078            wake_futex(this);
1079            if (++ret >= nr_wake)
1080                break;
1081        }
1082    }
1083
1084    if (op_ret > 0) {
1085        head = &hb2->chain;
1086
1087        op_ret = 0;
1088        plist_for_each_entry_safe(this, next, head, list) {
1089            if (match_futex (&this->key, &key2)) {
1090                wake_futex(this);
1091                if (++op_ret >= nr_wake2)
1092                    break;
1093            }
1094        }
1095        ret += op_ret;
1096    }
1097
1098    double_unlock_hb(hb1, hb2);
1099out_put_keys:
1100    put_futex_key(&key2);
1101out_put_key1:
1102    put_futex_key(&key1);
1103out:
1104    return ret;
1105}
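/*
 * Hypothetical userspace sketch of FUTEX_WAKE_OP (not part of this file):
 * atomically set *uaddr2 to 1, wake one waiter on uaddr1, and wake one
 * waiter on uaddr2 as well if the old value of *uaddr2 was 0. The second
 * wake count is passed through the timeout argument slot:
 *
 *    syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1, (void *)1UL, uaddr2,
 *            FUTEX_OP(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0));
 */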
1106
1107/**
1108 * requeue_futex() - Requeue a futex_q from one hb to another
1109 * @q: the futex_q to requeue
1110 * @hb1: the source hash_bucket
1111 * @hb2: the target hash_bucket
1112 * @key2: the new key for the requeued futex_q
1113 */
1114static inline
1115void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1116           struct futex_hash_bucket *hb2, union futex_key *key2)
1117{
1118
1119    /*
1120     * If key1 and key2 hash to the same bucket, no need to
1121     * requeue.
1122     */
1123    if (likely(&hb1->chain != &hb2->chain)) {
1124        plist_del(&q->list, &hb1->chain);
1125        plist_add(&q->list, &hb2->chain);
1126        q->lock_ptr = &hb2->lock;
1127    }
1128    get_futex_key_refs(key2);
1129    q->key = *key2;
1130}
1131
1132/**
1133 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1134 * @q: the futex_q
1135 * @key: the key of the requeue target futex
1136 * @hb: the hash_bucket of the requeue target futex
1137 *
1138 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1139 * target futex if it is uncontended or via a lock steal. Set the futex_q key
1140 * to the requeue target futex so the waiter can detect the wakeup on the right
1141 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1142 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1143 * to protect access to the pi_state to fixup the owner later. Must be called
1144 * with both q->lock_ptr and hb->lock held.
1145 */
1146static inline
1147void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1148               struct futex_hash_bucket *hb)
1149{
1150    get_futex_key_refs(key);
1151    q->key = *key;
1152
1153    __unqueue_futex(q);
1154
1155    WARN_ON(!q->rt_waiter);
1156    q->rt_waiter = NULL;
1157
1158    q->lock_ptr = &hb->lock;
1159
1160    wake_up_state(q->task, TASK_NORMAL);
1161}
1162
1163/**
1164 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1165 * @pifutex: the user address of the to futex
1166 * @hb1: the from futex hash bucket, must be locked by the caller
1167 * @hb2: the to futex hash bucket, must be locked by the caller
1168 * @key1: the from futex key
1169 * @key2: the to futex key
1170 * @ps: address to store the pi_state pointer
1171 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1172 *
1173 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1174 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1175 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1176 * hb1 and hb2 must be held by the caller.
1177 *
1178 * Returns:
1179 * 0 - failed to acquire the lock atomically
1180 * 1 - acquired the lock
1181 * <0 - error
1182 */
1183static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1184                 struct futex_hash_bucket *hb1,
1185                 struct futex_hash_bucket *hb2,
1186                 union futex_key *key1, union futex_key *key2,
1187                 struct futex_pi_state **ps, int set_waiters)
1188{
1189    struct futex_q *top_waiter = NULL;
1190    u32 curval;
1191    int ret;
1192
1193    if (get_futex_value_locked(&curval, pifutex))
1194        return -EFAULT;
1195
1196    /*
1197     * Find the top_waiter and determine if there are additional waiters.
1198     * If the caller intends to requeue more than 1 waiter to pifutex,
1199     * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1200     * as we have means to handle the possible fault. If not, don't set
1201     * the bit unnecessarily as it will force the subsequent unlock to enter
1202     * the kernel.
1203     */
1204    top_waiter = futex_top_waiter(hb1, key1);
1205
1206    /* There are no waiters, nothing for us to do. */
1207    if (!top_waiter)
1208        return 0;
1209
1210    /* Ensure we requeue to the expected futex. */
1211    if (!match_futex(top_waiter->requeue_pi_key, key2))
1212        return -EINVAL;
1213
1214    /*
1215     * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1216     * the contended case or if set_waiters is 1. The pi_state is returned
1217     * in ps in contended cases.
1218     */
1219    ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1220                   set_waiters);
1221    if (ret == 1)
1222        requeue_pi_wake_futex(top_waiter, key2, hb2);
1223
1224    return ret;
1225}
1226
1227/**
1228 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1229 * @uaddr1: source futex user address
1230 * @flags: futex flags (FLAGS_SHARED, etc.)
1231 * @uaddr2: target futex user address
1232 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1233 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1234 * @cmpval: @uaddr1 expected value (or %NULL)
1235 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1236 * pi futex (pi to pi requeue is not supported)
1237 *
1238 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1239 * uaddr2 atomically on behalf of the top waiter.
1240 *
1241 * Returns:
1242 * >=0 - on success, the number of tasks requeued or woken
1243 * <0 - on error
1244 */
1245static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1246             u32 __user *uaddr2, int nr_wake, int nr_requeue,
1247             u32 *cmpval, int requeue_pi)
1248{
1249    union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1250    int drop_count = 0, task_count = 0, ret;
1251    struct futex_pi_state *pi_state = NULL;
1252    struct futex_hash_bucket *hb1, *hb2;
1253    struct plist_head *head1;
1254    struct futex_q *this, *next;
1255    u32 curval2;
1256
1257    if (requeue_pi) {
1258        /*
1259         * requeue_pi requires a pi_state, try to allocate it now
1260         * without any locks in case it fails.
1261         */
1262        if (refill_pi_state_cache())
1263            return -ENOMEM;
1264        /*
1265         * requeue_pi must wake as many tasks as it can, up to nr_wake
1266         * + nr_requeue, since it acquires the rt_mutex prior to
1267         * returning to userspace, so as to not leave the rt_mutex with
1268         * waiters and no owner. However, second and third wake-ups
1269         * cannot be predicted as they involve race conditions with the
1270         * first wake and a fault while looking up the pi_state. Both
1271         * pthread_cond_signal() and pthread_cond_broadcast() should
1272         * use nr_wake=1.
1273         */
1274        if (nr_wake != 1)
1275            return -EINVAL;
1276    }
1277
1278retry:
1279    if (pi_state != NULL) {
1280        /*
1281         * We will have to lookup the pi_state again, so free this one
1282         * to keep the accounting correct.
1283         */
1284        free_pi_state(pi_state);
1285        pi_state = NULL;
1286    }
1287
1288    ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1289    if (unlikely(ret != 0))
1290        goto out;
1291    ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1292                requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1293    if (unlikely(ret != 0))
1294        goto out_put_key1;
1295
1296    hb1 = hash_futex(&key1);
1297    hb2 = hash_futex(&key2);
1298
1299retry_private:
1300    double_lock_hb(hb1, hb2);
1301
1302    if (likely(cmpval != NULL)) {
1303        u32 curval;
1304
1305        ret = get_futex_value_locked(&curval, uaddr1);
1306
1307        if (unlikely(ret)) {
1308            double_unlock_hb(hb1, hb2);
1309
1310            ret = get_user(curval, uaddr1);
1311            if (ret)
1312                goto out_put_keys;
1313
1314            if (!(flags & FLAGS_SHARED))
1315                goto retry_private;
1316
1317            put_futex_key(&key2);
1318            put_futex_key(&key1);
1319            goto retry;
1320        }
1321        if (curval != *cmpval) {
1322            ret = -EAGAIN;
1323            goto out_unlock;
1324        }
1325    }
1326
1327    if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1328        /*
1329         * Attempt to acquire uaddr2 and wake the top waiter. If we
1330         * intend to requeue waiters, force setting the FUTEX_WAITERS
1331         * bit. We force this here where we are able to easily handle
1332         * faults rather than in the requeue loop below.
1333         */
1334        ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1335                         &key2, &pi_state, nr_requeue);
1336
1337        /*
1338         * At this point the top_waiter has either taken uaddr2 or is
1339         * waiting on it. If the former, then the pi_state will not
1340         * exist yet, look it up one more time to ensure we have a
1341         * reference to it.
1342         */
1343        if (ret == 1) {
1344            WARN_ON(pi_state);
1345            drop_count++;
1346            task_count++;
1347            ret = get_futex_value_locked(&curval2, uaddr2);
1348            if (!ret)
1349                ret = lookup_pi_state(curval2, hb2, &key2,
1350                              &pi_state);
1351        }
1352
1353        switch (ret) {
1354        case 0:
1355            break;
1356        case -EFAULT:
1357            double_unlock_hb(hb1, hb2);
1358            put_futex_key(&key2);
1359            put_futex_key(&key1);
1360            ret = fault_in_user_writeable(uaddr2);
1361            if (!ret)
1362                goto retry;
1363            goto out;
1364        case -EAGAIN:
1365            /* The owner was exiting, try again. */
1366            double_unlock_hb(hb1, hb2);
1367            put_futex_key(&key2);
1368            put_futex_key(&key1);
1369            cond_resched();
1370            goto retry;
1371        default:
1372            goto out_unlock;
1373        }
1374    }
1375
1376    head1 = &hb1->chain;
1377    plist_for_each_entry_safe(this, next, head1, list) {
1378        if (task_count - nr_wake >= nr_requeue)
1379            break;
1380
1381        if (!match_futex(&this->key, &key1))
1382            continue;
1383
1384        /*
1385         * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1386         * be paired with each other and no other futex ops.
1387         */
1388        if ((requeue_pi && !this->rt_waiter) ||
1389            (!requeue_pi && this->rt_waiter)) {
1390            ret = -EINVAL;
1391            break;
1392        }
1393
1394        /*
1395         * Wake nr_wake waiters. For requeue_pi, if we acquired the
1396         * lock, we already woke the top_waiter. If not, it will be
1397         * woken by futex_unlock_pi().
1398         */
1399        if (++task_count <= nr_wake && !requeue_pi) {
1400            wake_futex(this);
1401            continue;
1402        }
1403
1404        /* Ensure we requeue to the expected futex for requeue_pi. */
1405        if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1406            ret = -EINVAL;
1407            break;
1408        }
1409
1410        /*
1411         * Requeue nr_requeue waiters and possibly one more in the case
1412         * of requeue_pi if we couldn't acquire the lock atomically.
1413         */
1414        if (requeue_pi) {
1415            /* Prepare the waiter to take the rt_mutex. */
1416            atomic_inc(&pi_state->refcount);
1417            this->pi_state = pi_state;
1418            ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1419                            this->rt_waiter,
1420                            this->task, 1);
1421            if (ret == 1) {
1422                /* We got the lock. */
1423                requeue_pi_wake_futex(this, &key2, hb2);
1424                drop_count++;
1425                continue;
1426            } else if (ret) {
1427                /* -EDEADLK */
1428                this->pi_state = NULL;
1429                free_pi_state(pi_state);
1430                goto out_unlock;
1431            }
1432        }
1433        requeue_futex(this, hb1, hb2, &key2);
1434        drop_count++;
1435    }
1436
1437out_unlock:
1438    double_unlock_hb(hb1, hb2);
1439
1440    /*
1441     * drop_futex_key_refs() must be called outside the spinlocks. During
1442     * the requeue we moved futex_q's from the hash bucket at key1 to the
1443     * one at key2 and updated their key pointer. We no longer need to
1444     * hold the references to key1.
1445     */
1446    while (--drop_count >= 0)
1447        drop_futex_key_refs(&key1);
1448
1449out_put_keys:
1450    put_futex_key(&key2);
1451out_put_key1:
1452    put_futex_key(&key1);
1453out:
1454    if (pi_state != NULL)
1455        free_pi_state(pi_state);
1456    return ret ? ret : task_count;
1457}
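/*
 * Hypothetical userspace sketch of the plain (non-PI) requeue path, in the
 * style of a condition variable broadcast: wake one waiter on the condvar
 * word and requeue the rest onto the mutex word, provided the condvar word
 * still holds the expected value. nr_requeue is passed through the timeout
 * argument slot:
 *
 *    syscall(SYS_futex, &cond_word, FUTEX_CMP_REQUEUE, 1,
 *            (void *)(unsigned long)INT_MAX, &mutex_word, cond_val);
 */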
1458
1459/* The key must be already stored in q->key. */
1460static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1461    __acquires(&hb->lock)
1462{
1463    struct futex_hash_bucket *hb;
1464
1465    hb = hash_futex(&q->key);
1466    q->lock_ptr = &hb->lock;
1467
1468    spin_lock(&hb->lock);
1469    return hb;
1470}
1471
1472static inline void
1473queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1474    __releases(&hb->lock)
1475{
1476    spin_unlock(&hb->lock);
1477}
1478
1479/**
1480 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1481 * @q: The futex_q to enqueue
1482 * @hb: The destination hash bucket
1483 *
1484 * The hb->lock must be held by the caller, and is released here. A call to
1485 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1486 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1487 * or nothing if the unqueue is done as part of the wake process and the unqueue
1488 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
1489 * an example).
1490 */
1491static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1492    __releases(&hb->lock)
1493{
1494    int prio;
1495
1496    /*
1497     * The priority used to register this element is
1498     * - either the real thread-priority for the real-time threads
1499     * (i.e. threads with a priority lower than MAX_RT_PRIO)
1500     * - or MAX_RT_PRIO for non-RT threads.
1501     * Thus, all RT-threads are woken first in priority order, and
1502     * the others are woken last, in FIFO order.
1503     */
1504    prio = min(current->normal_prio, MAX_RT_PRIO);
1505
1506    plist_node_init(&q->list, prio);
1507    plist_add(&q->list, &hb->chain);
1508    q->task = current;
1509    spin_unlock(&hb->lock);
1510}
1511
1512/**
1513 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1514 * @q: The futex_q to unqueue
1515 *
1516 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1517 * be paired with exactly one earlier call to queue_me().
1518 *
1519 * Returns:
1520 * 1 - if the futex_q was still queued (and we unqueued it)
1521 * 0 - if the futex_q was already removed by the waking thread
1522 */
1523static int unqueue_me(struct futex_q *q)
1524{
1525    spinlock_t *lock_ptr;
1526    int ret = 0;
1527
1528    /* In the common case we don't take the spinlock, which is nice. */
1529retry:
1530    lock_ptr = q->lock_ptr;
1531    barrier();
1532    if (lock_ptr != NULL) {
1533        spin_lock(lock_ptr);
1534        /*
1535         * q->lock_ptr can change between reading it and
1536         * spin_lock(), causing us to take the wrong lock. This
1537         * corrects the race condition.
1538         *
1539         * Reasoning goes like this: if we have the wrong lock,
1540         * q->lock_ptr must have changed (maybe several times)
1541         * between reading it and the spin_lock(). It can
1542         * change again after the spin_lock() but only if it was
1543         * already changed before the spin_lock(). It cannot,
1544         * however, change back to the original value. Therefore
1545         * we can detect whether we acquired the correct lock.
1546         */
1547        if (unlikely(lock_ptr != q->lock_ptr)) {
1548            spin_unlock(lock_ptr);
1549            goto retry;
1550        }
1551        __unqueue_futex(q);
1552
1553        BUG_ON(q->pi_state);
1554
1555        spin_unlock(lock_ptr);
1556        ret = 1;
1557    }
1558
1559    drop_futex_key_refs(&q->key);
1560    return ret;
1561}
1562
1563/*
1564 * PI futexes cannot be requeued and must remove themselves from the
1565 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1566 * and dropped here.
1567 */
1568static void unqueue_me_pi(struct futex_q *q)
1569    __releases(q->lock_ptr)
1570{
1571    __unqueue_futex(q);
1572
1573    BUG_ON(!q->pi_state);
1574    free_pi_state(q->pi_state);
1575    q->pi_state = NULL;
1576
1577    spin_unlock(q->lock_ptr);
1578}
1579
1580/*
1581 * Fixup the pi_state owner with the new owner.
1582 *
1583 * Must be called with hash bucket lock held and mm->sem held for non
1584 * private futexes.
1585 */
1586static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1587                struct task_struct *newowner)
1588{
1589    u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1590    struct futex_pi_state *pi_state = q->pi_state;
1591    struct task_struct *oldowner = pi_state->owner;
1592    u32 uval, uninitialized_var(curval), newval;
1593    int ret;
1594
1595    /* Owner died? */
1596    if (!pi_state->owner)
1597        newtid |= FUTEX_OWNER_DIED;
1598
1599    /*
1600     * We are here either because we stole the rtmutex from the
1601     * previous highest priority waiter or we are the highest priority
1602     * waiter but failed to get the rtmutex the first time.
1603     * We have to replace the newowner TID in the user space variable.
1604     * This must be atomic as we have to preserve the owner died bit here.
1605     *
1606     * Note: We write the user space value _before_ changing the pi_state
1607     * because we can fault here. Imagine swapped out pages or a fork
1608     * that marked all the anonymous memory readonly for cow.
1609     *
1610     * Modifying pi_state _before_ the user space value would
1611     * leave the pi_state in an inconsistent state when we fault
1612     * here, because we need to drop the hash bucket lock to
1613     * handle the fault. This might be observed in the PID check
1614     * in lookup_pi_state.
1615     */
1616retry:
1617    if (get_futex_value_locked(&uval, uaddr))
1618        goto handle_fault;
1619
1620    while (1) {
1621        newval = (uval & FUTEX_OWNER_DIED) | newtid;
1622
1623        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1624            goto handle_fault;
1625        if (curval == uval)
1626            break;
1627        uval = curval;
1628    }
1629
1630    /*
1631     * We fixed up user space. Now we need to fix the pi_state
1632     * itself.
1633     */
1634    if (pi_state->owner != NULL) {
1635        raw_spin_lock_irq(&pi_state->owner->pi_lock);
1636        WARN_ON(list_empty(&pi_state->list));
1637        list_del_init(&pi_state->list);
1638        raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1639    }
1640
1641    pi_state->owner = newowner;
1642
1643    raw_spin_lock_irq(&newowner->pi_lock);
1644    WARN_ON(!list_empty(&pi_state->list));
1645    list_add(&pi_state->list, &newowner->pi_state_list);
1646    raw_spin_unlock_irq(&newowner->pi_lock);
1647    return 0;
1648
1649    /*
1650     * To handle the page fault we need to drop the hash bucket
1651     * lock here. That gives the other task (either the highest priority
1652     * waiter itself or the task which stole the rtmutex) the
1653     * chance to try the fixup of the pi_state. So once we are
1654     * back from handling the fault we need to check the pi_state
1655     * after reacquiring the hash bucket lock and before trying to
1656     * do another fixup. When the fixup has been done already we
1657     * simply return.
1658     */
1659handle_fault:
1660    spin_unlock(q->lock_ptr);
1661
1662    ret = fault_in_user_writeable(uaddr);
1663
1664    spin_lock(q->lock_ptr);
1665
1666    /*
1667     * Check if someone else fixed it for us:
1668     */
1669    if (pi_state->owner != oldowner)
1670        return 0;
1671
1672    if (ret)
1673        return ret;
1674
1675    goto retry;
1676}
1677
1678static long futex_wait_restart(struct restart_block *restart);
1679
1680/**
1681 * fixup_owner() - Post lock pi_state and corner case management
1682 * @uaddr: user address of the futex
1683 * @q: futex_q (contains pi_state and access to the rt_mutex)
1684 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1685 *
1686 * After attempting to lock an rt_mutex, this function is called to cleanup
1687 * the pi_state owner as well as handle race conditions that may allow us to
1688 * acquire the lock. Must be called with the hb lock held.
1689 *
1690 * Returns:
1691 * 1 - success, lock taken
1692 * 0 - success, lock not taken
1693 * <0 - on error (-EFAULT)
1694 */
1695static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1696{
1697    struct task_struct *owner;
1698    int ret = 0;
1699
1700    if (locked) {
1701        /*
1702         * Got the lock. We might not be the anticipated owner if we
1703         * did a lock-steal - fix up the PI-state in that case:
1704         */
1705        if (q->pi_state->owner != current)
1706            ret = fixup_pi_state_owner(uaddr, q, current);
1707        goto out;
1708    }
1709
1710    /*
1711     * Catch the rare case, where the lock was released when we were on the
1712     * way back before we locked the hash bucket.
1713     */
1714    if (q->pi_state->owner == current) {
1715        /*
1716         * Try to get the rt_mutex now. This might fail as some other
1717         * task acquired the rt_mutex after we removed ourselves from the
1718         * rt_mutex waiters list.
1719         */
1720        if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1721            locked = 1;
1722            goto out;
1723        }
1724
1725        /*
1726         * pi_state is incorrect, some other task did a lock steal and
1727         * we returned due to timeout or signal without taking the
1728         * rt_mutex. Too late.
1729         */
1730        raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1731        owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1732        if (!owner)
1733            owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1734        raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1735        ret = fixup_pi_state_owner(uaddr, q, owner);
1736        goto out;
1737    }
1738
1739    /*
1740     * Paranoia check. If we did not take the lock, then we should not be
1741     * the owner of the rt_mutex.
1742     */
1743    if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1744        printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1745                "pi-state %p\n", ret,
1746                q->pi_state->pi_mutex.owner,
1747                q->pi_state->owner);
1748
1749out:
1750    return ret ? ret : locked;
1751}
1752
1753/**
1754 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1755 * @hb: the futex hash bucket, must be locked by the caller
1756 * @q: the futex_q to queue up on
1757 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1758 */
1759static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1760                struct hrtimer_sleeper *timeout)
1761{
1762    /*
1763     * The task state is guaranteed to be set before another task can
1764     * wake it. set_current_state() is implemented using set_mb() and
1765     * queue_me() calls spin_unlock() upon completion, both serializing
1766     * access to the hash list and forcing another memory barrier.
1767     */
1768    set_current_state(TASK_INTERRUPTIBLE);
1769    queue_me(q, hb);
1770
1771    /* Arm the timer */
1772    if (timeout) {
1773        hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1774        if (!hrtimer_active(&timeout->timer))
1775            timeout->task = NULL;
1776    }
1777
1778    /*
1779     * If we have been removed from the hash list, then another task
1780     * has tried to wake us, and we can skip the call to schedule().
1781     */
1782    if (likely(!plist_node_empty(&q->list))) {
1783        /*
1784         * If the timer has already expired, current will already be
1785         * flagged for rescheduling. Only call schedule if there
1786         * is no timeout, or if it has yet to expire.
1787         */
1788        if (!timeout || timeout->task)
1789            schedule();
1790    }
1791    __set_current_state(TASK_RUNNING);
1792}
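
/*
 * Illustrative userspace sketch (not part of this file): the hrtimer armed
 * above is what userspace observes as ETIMEDOUT. For plain FUTEX_WAIT the
 * timespec handed to the syscall is relative; sys_futex() below converts it
 * into the absolute expiry used here. The helper below is hypothetical and
 * only meant to show the error mapping.
 *
 *    #include <errno.h>
 *    #include <linux/futex.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <time.h>
 *    #include <unistd.h>
 *
 *    // Wait for *word to stop being 'expected', for at most 'ms' ms.
 *    // Returns 0 on wakeup, -1 on timeout, 1 if the caller should just
 *    // re-check its condition (EAGAIN, EINTR, spurious wakeup, ...).
 *    static int wait_with_timeout(uint32_t *word, uint32_t expected, int ms)
 *    {
 *        struct timespec rel = {
 *            .tv_sec  = ms / 1000,
 *            .tv_nsec = (long)(ms % 1000) * 1000000L,
 *        };
 *
 *        if (syscall(SYS_futex, word, FUTEX_WAIT_PRIVATE, expected,
 *                    &rel, NULL, 0) == 0)
 *            return 0;
 *        return errno == ETIMEDOUT ? -1 : 1;
 *    }
 */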
1793
1794/**
1795 * futex_wait_setup() - Prepare to wait on a futex
1796 * @uaddr: the futex userspace address
1797 * @val: the expected value
1798 * @flags: futex flags (FLAGS_SHARED, etc.)
1799 * @q: the associated futex_q
1800 * @hb: storage for hash_bucket pointer to be returned to caller
1801 *
1802 * Setup the futex_q and locate the hash_bucket. Get the futex value and
1803 * compare it with the expected value. Handle atomic faults internally.
1804 * Return with the hb lock held and a q.key reference on success, and unlocked
1805 * with no q.key reference on failure.
1806 *
1807 * Returns:
1808 * 0 - uaddr contains val and hb has been locked
1809 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1810 */
1811static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1812               struct futex_q *q, struct futex_hash_bucket **hb)
1813{
1814    u32 uval;
1815    int ret;
1816
1817    /*
1818     * Access the page AFTER the hash-bucket is locked.
1819     * Order is important:
1820     *
1821     * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1822     * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
1823     *
1824     * The basic logical guarantee of a futex is that it blocks ONLY
1825     * if cond(var) is known to be true at the time of blocking, for
1826     * any cond. If we locked the hash-bucket after testing *uaddr, that
1827     * would open a race condition where we could block indefinitely with
1828     * cond(var) false, which would violate the guarantee.
1829     *
1830     * On the other hand, we insert q and release the hash-bucket only
1831     * after testing *uaddr. This guarantees that futex_wait() will NOT
1832     * absorb a wakeup if *uaddr does not match the desired values
1833     * while the syscall executes.
1834     */
1835retry:
1836    ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1837    if (unlikely(ret != 0))
1838        return ret;
1839
1840retry_private:
1841    *hb = queue_lock(q);
1842
1843    ret = get_futex_value_locked(&uval, uaddr);
1844
1845    if (ret) {
1846        queue_unlock(q, *hb);
1847
1848        ret = get_user(uval, uaddr);
1849        if (ret)
1850            goto out;
1851
1852        if (!(flags & FLAGS_SHARED))
1853            goto retry_private;
1854
1855        put_futex_key(&q->key);
1856        goto retry;
1857    }
1858
1859    if (uval != val) {
1860        queue_unlock(q, *hb);
1861        ret = -EWOULDBLOCK;
1862    }
1863
1864out:
1865    if (ret)
1866        put_futex_key(&q->key);
1867    return ret;
1868}
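
/*
 * Illustrative userspace sketch (not part of this file) of the waiter/waker
 * protocol spelled out in the ordering comment above: the waker publishes
 * the new value before calling FUTEX_WAKE, and the waiter passes the value
 * it sampled so futex_wait_setup() can refuse to sleep (EWOULDBLOCK) when
 * that sample is already stale. The flag and helpers are hypothetical.
 *
 *    #include <limits.h>
 *    #include <linux/futex.h>
 *    #include <stdatomic.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    static _Atomic uint32_t flag;    // shared condition word
 *
 *    // Waiter: sample first, then hand the sample to FUTEX_WAIT. The
 *    // kernel re-checks flag == seen under the bucket lock, so a wake
 *    // racing with this call cannot be lost.
 *    static void wait_for_flag(void)
 *    {
 *        uint32_t seen = atomic_load(&flag);
 *
 *        while (seen == 0) {
 *            syscall(SYS_futex, &flag, FUTEX_WAIT_PRIVATE, seen,
 *                    NULL, NULL, 0);
 *            seen = atomic_load(&flag);
 *        }
 *    }
 *
 *    // Waker: publish the new value before issuing the wakeup.
 *    static void set_flag(void)
 *    {
 *        atomic_store(&flag, 1);
 *        syscall(SYS_futex, &flag, FUTEX_WAKE_PRIVATE, INT_MAX,
 *                NULL, NULL, 0);
 *    }
 */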
1869
1870static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1871              ktime_t *abs_time, u32 bitset)
1872{
1873    struct hrtimer_sleeper timeout, *to = NULL;
1874    struct restart_block *restart;
1875    struct futex_hash_bucket *hb;
1876    struct futex_q q = futex_q_init;
1877    int ret;
1878
1879    if (!bitset)
1880        return -EINVAL;
1881    q.bitset = bitset;
1882
1883    if (abs_time) {
1884        to = &timeout;
1885
1886        hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1887                      CLOCK_REALTIME : CLOCK_MONOTONIC,
1888                      HRTIMER_MODE_ABS);
1889        hrtimer_init_sleeper(to, current);
1890        hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1891                         current->timer_slack_ns);
1892    }
1893
1894retry:
1895    /*
1896     * Prepare to wait on uaddr. On success, holds hb lock and increments
1897     * q.key refs.
1898     */
1899    ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1900    if (ret)
1901        goto out;
1902
1903    /* queue_me and wait for wakeup, timeout, or a signal. */
1904    futex_wait_queue_me(hb, &q, to);
1905
1906    /* If we were woken (and unqueued), we succeeded, whatever. */
1907    ret = 0;
1908    /* unqueue_me() drops q.key ref */
1909    if (!unqueue_me(&q))
1910        goto out;
1911    ret = -ETIMEDOUT;
1912    if (to && !to->task)
1913        goto out;
1914
1915    /*
1916     * We expect signal_pending(current), but we might be the
1917     * victim of a spurious wakeup as well.
1918     */
1919    if (!signal_pending(current))
1920        goto retry;
1921
1922    ret = -ERESTARTSYS;
1923    if (!abs_time)
1924        goto out;
1925
1926    restart = &current_thread_info()->restart_block;
1927    restart->fn = futex_wait_restart;
1928    restart->futex.uaddr = uaddr;
1929    restart->futex.val = val;
1930    restart->futex.time = abs_time->tv64;
1931    restart->futex.bitset = bitset;
1932    restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1933
1934    ret = -ERESTART_RESTARTBLOCK;
1935
1936out:
1937    if (to) {
1938        hrtimer_cancel(&to->timer);
1939        destroy_hrtimer_on_stack(&to->timer);
1940    }
1941    return ret;
1942}
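
/*
 * Illustrative userspace sketch (not part of this file): futex_wait() keeps
 * the caller-supplied bitset in q.bitset, and a waker's bitset must
 * intersect it for the wakeup to be delivered. Plain FUTEX_WAIT/FUTEX_WAKE
 * use FUTEX_BITSET_MATCH_ANY (see do_futex() below); the BITSET variants
 * expose the bits as crude wakeup channels. The channel values here are
 * made up.
 *
 *    #include <limits.h>
 *    #include <linux/futex.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    #define READER_CHANNEL 0x1
 *    #define WRITER_CHANNEL 0x2
 *
 *    // Sleep tagged with the reader channel; only wakeups whose bitset
 *    // intersects READER_CHANNEL reach this waiter. A non-NULL timeout
 *    // for FUTEX_WAIT_BITSET would be absolute, not relative.
 *    static long wait_as_reader(uint32_t *word, uint32_t expected)
 *    {
 *        return syscall(SYS_futex, word,
 *                       FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, expected,
 *                       NULL, NULL, READER_CHANNEL);
 *    }
 *
 *    // Wake every waiter whose bitset intersects READER_CHANNEL.
 *    static long wake_readers(uint32_t *word)
 *    {
 *        return syscall(SYS_futex, word,
 *                       FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, INT_MAX,
 *                       NULL, NULL, READER_CHANNEL);
 *    }
 */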
1943
1944
1945static long futex_wait_restart(struct restart_block *restart)
1946{
1947    u32 __user *uaddr = restart->futex.uaddr;
1948    ktime_t t, *tp = NULL;
1949
1950    if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1951        t.tv64 = restart->futex.time;
1952        tp = &t;
1953    }
1954    restart->fn = do_no_restart_syscall;
1955
1956    return (long)futex_wait(uaddr, restart->futex.flags,
1957                restart->futex.val, tp, restart->futex.bitset);
1958}
1959
1960
1961/*
1962 * Userspace tried a 0 -> TID atomic transition of the futex value
1963 * and failed. The kernel side here does the whole locking operation:
1964 * if there are waiters then it will block, it does PI, etc. (Due to
1965 * races the kernel might see a 0 value of the futex too.)
1966 */
1967static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1968             ktime_t *time, int trylock)
1969{
1970    struct hrtimer_sleeper timeout, *to = NULL;
1971    struct futex_hash_bucket *hb;
1972    struct futex_q q = futex_q_init;
1973    int res, ret;
1974
1975    if (refill_pi_state_cache())
1976        return -ENOMEM;
1977
1978    if (time) {
1979        to = &timeout;
1980        hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1981                      HRTIMER_MODE_ABS);
1982        hrtimer_init_sleeper(to, current);
1983        hrtimer_set_expires(&to->timer, *time);
1984    }
1985
1986retry:
1987    ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
1988    if (unlikely(ret != 0))
1989        goto out;
1990
1991retry_private:
1992    hb = queue_lock(&q);
1993
1994    ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1995    if (unlikely(ret)) {
1996        switch (ret) {
1997        case 1:
1998            /* We got the lock. */
1999            ret = 0;
2000            goto out_unlock_put_key;
2001        case -EFAULT:
2002            goto uaddr_faulted;
2003        case -EAGAIN:
2004            /*
2005             * Task is exiting and we just wait for the
2006             * exit to complete.
2007             */
2008            queue_unlock(&q, hb);
2009            put_futex_key(&q.key);
2010            cond_resched();
2011            goto retry;
2012        default:
2013            goto out_unlock_put_key;
2014        }
2015    }
2016
2017    /*
2018     * Only actually queue now that the atomic ops are done:
2019     */
2020    queue_me(&q, hb);
2021
2022    WARN_ON(!q.pi_state);
2023    /*
2024     * Block on the PI mutex:
2025     */
2026    if (!trylock)
2027        ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2028    else {
2029        ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2030        /* Fixup the trylock return value: */
2031        ret = ret ? 0 : -EWOULDBLOCK;
2032    }
2033
2034    spin_lock(q.lock_ptr);
2035    /*
2036     * Fixup the pi_state owner and possibly acquire the lock if we
2037     * haven't already.
2038     */
2039    res = fixup_owner(uaddr, &q, !ret);
2040    /*
2041     * If fixup_owner() returned an error, propagate that. If it acquired
2042     * the lock, clear our -ETIMEDOUT or -EINTR.
2043     */
2044    if (res)
2045        ret = (res < 0) ? res : 0;
2046
2047    /*
2048     * If fixup_owner() faulted and was unable to handle the fault, unlock
2049     * it and return the fault to userspace.
2050     */
2051    if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2052        rt_mutex_unlock(&q.pi_state->pi_mutex);
2053
2054    /* Unqueue and drop the lock */
2055    unqueue_me_pi(&q);
2056
2057    goto out_put_key;
2058
2059out_unlock_put_key:
2060    queue_unlock(&q, hb);
2061
2062out_put_key:
2063    put_futex_key(&q.key);
2064out:
2065    if (to)
2066        destroy_hrtimer_on_stack(&to->timer);
2067    return ret != -EINTR ? ret : -ERESTARTNOINTR;
2068
2069uaddr_faulted:
2070    queue_unlock(&q, hb);
2071
2072    ret = fault_in_user_writeable(uaddr);
2073    if (ret)
2074        goto out_put_key;
2075
2076    if (!(flags & FLAGS_SHARED))
2077        goto retry_private;
2078
2079    put_futex_key(&q.key);
2080    goto retry;
2081}
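
/*
 * Illustrative userspace sketch (not part of this file) of the contract in
 * the comment above futex_lock_pi(): userspace attempts the 0 -> TID
 * transition itself and enters the kernel only on contention. The helper
 * is hypothetical and skips error handling and robust/owner-died details.
 *
 *    #include <linux/futex.h>
 *    #include <stdatomic.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    // PI lock word: 0 = unlocked, otherwise owner TID plus the
 *    // FUTEX_WAITERS/FUTEX_OWNER_DIED bits maintained by the kernel.
 *    static int pi_lock(_Atomic uint32_t *word)
 *    {
 *        uint32_t zero = 0;
 *        uint32_t tid = (uint32_t)syscall(SYS_gettid);
 *
 *        // Fast path: 0 -> TID in userspace, no kernel entry.
 *        if (atomic_compare_exchange_strong(word, &zero, tid))
 *            return 0;
 *
 *        // Contended: futex_lock_pi() queues us on the rt_mutex, applies
 *        // priority inheritance and stores TID | FUTEX_WAITERS on success.
 *        return syscall(SYS_futex, word,
 *                       FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG, 0, NULL,
 *                       NULL, 0);
 *    }
 */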
2082
2083/*
2084 * Userspace attempted a TID -> 0 atomic transition, and failed.
2085 * This is the in-kernel slowpath: we look up the PI state (if any),
2086 * and do the rt-mutex unlock.
2087 */
2088static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2089{
2090    struct futex_hash_bucket *hb;
2091    struct futex_q *this, *next;
2092    struct plist_head *head;
2093    union futex_key key = FUTEX_KEY_INIT;
2094    u32 uval, vpid = task_pid_vnr(current);
2095    int ret;
2096
2097retry:
2098    if (get_user(uval, uaddr))
2099        return -EFAULT;
2100    /*
2101     * We release only a lock we actually own:
2102     */
2103    if ((uval & FUTEX_TID_MASK) != vpid)
2104        return -EPERM;
2105
2106    ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2107    if (unlikely(ret != 0))
2108        goto out;
2109
2110    hb = hash_futex(&key);
2111    spin_lock(&hb->lock);
2112
2113    /*
2114     * To avoid races, try to do the TID -> 0 atomic transition
2115     * again. If it succeeds then we can return without waking
2116     * anyone else up:
2117     */
2118    if (!(uval & FUTEX_OWNER_DIED) &&
2119        cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2120        goto pi_faulted;
2121    /*
2122     * Rare case: we managed to release the lock atomically,
2123     * no need to wake anyone else up:
2124     */
2125    if (unlikely(uval == vpid))
2126        goto out_unlock;
2127
2128    /*
2129     * Ok, other tasks may need to be woken up - check waiters
2130     * and do the wakeup if necessary:
2131     */
2132    head = &hb->chain;
2133
2134    plist_for_each_entry_safe(this, next, head, list) {
2135        if (!match_futex(&this->key, &key))
2136            continue;
2137        ret = wake_futex_pi(uaddr, uval, this);
2138        /*
2139         * The atomic access to the futex value
2140         * generated a pagefault, so retry the
2141         * user-access and the wakeup:
2142         */
2143        if (ret == -EFAULT)
2144            goto pi_faulted;
2145        goto out_unlock;
2146    }
2147    /*
2148     * No waiters - kernel unlocks the futex:
2149     */
2150    if (!(uval & FUTEX_OWNER_DIED)) {
2151        ret = unlock_futex_pi(uaddr, uval);
2152        if (ret == -EFAULT)
2153            goto pi_faulted;
2154    }
2155
2156out_unlock:
2157    spin_unlock(&hb->lock);
2158    put_futex_key(&key);
2159
2160out:
2161    return ret;
2162
2163pi_faulted:
2164    spin_unlock(&hb->lock);
2165    put_futex_key(&key);
2166
2167    ret = fault_in_user_writeable(uaddr);
2168    if (!ret)
2169        goto retry;
2170
2171    return ret;
2172}
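
/*
 * Illustrative userspace counterpart (not part of this file) to the
 * pi_lock() sketch above, matching the TID -> 0 contract described before
 * futex_unlock_pi(): only when the cmpxchg fails (FUTEX_WAITERS or
 * FUTEX_OWNER_DIED is set) does the kernel need to hand the lock over.
 *
 *    #include <linux/futex.h>
 *    #include <stdatomic.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    static int pi_unlock(_Atomic uint32_t *word)
 *    {
 *        uint32_t tid = (uint32_t)syscall(SYS_gettid);
 *
 *        // Fast path: we are the owner and nobody is queued behind us.
 *        if (atomic_compare_exchange_strong(word, &tid, 0))
 *            return 0;
 *
 *        // Waiters (or a dead-owner mark) present: let the kernel pick
 *        // the top rt_mutex waiter and rewrite the word on our behalf.
 *        return syscall(SYS_futex, word,
 *                       FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG, 0, NULL,
 *                       NULL, 0);
 *    }
 */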
2173
2174/**
2175 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2176 * @hb: the hash_bucket futex_q was originally enqueued on
2177 * @q: the futex_q woken while waiting to be requeued
2178 * @key2: the futex_key of the requeue target futex
2179 * @timeout: the timeout associated with the wait (NULL if none)
2180 *
2181 * Detect if the task was woken on the initial futex as opposed to the requeue
2182 * target futex. If so, determine if it was a timeout or a signal that caused
2183 * the wakeup and return the appropriate error code to the caller. Must be
2184 * called with the hb lock held.
2185 *
2186 * Returns:
2187 * 0 - no early wakeup detected
2188 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2189 */
2190static inline
2191int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2192                   struct futex_q *q, union futex_key *key2,
2193                   struct hrtimer_sleeper *timeout)
2194{
2195    int ret = 0;
2196
2197    /*
2198     * With the hb lock held, we avoid races while we process the wakeup.
2199     * We only need to hold hb (and not hb2) to ensure atomicity as the
2200     * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2201     * It can't be requeued from uaddr2 to something else since we don't
2202     * support a PI aware source futex for requeue.
2203     */
2204    if (!match_futex(&q->key, key2)) {
2205        WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2206        /*
2207         * We were woken prior to requeue by a timeout or a signal.
2208         * Unqueue the futex_q and determine which it was.
2209         */
2210        plist_del(&q->list, &hb->chain);
2211
2212        /* Handle spurious wakeups gracefully */
2213        ret = -EWOULDBLOCK;
2214        if (timeout && !timeout->task)
2215            ret = -ETIMEDOUT;
2216        else if (signal_pending(current))
2217            ret = -ERESTARTNOINTR;
2218    }
2219    return ret;
2220}
2221
2222/**
2223 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2224 * @uaddr: the futex we initially wait on (non-pi)
2225 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2226 * the same type, no requeueing from private to shared, etc.
2227 * @val: the expected value of uaddr
2228 * @abs_time: absolute timeout
2229 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2231 * @uaddr2: the pi futex we will take prior to returning to user-space
2232 *
2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2234 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2235 * complete the acquisition of the rt_mutex prior to returning to userspace.
2236 * This ensures the rt_mutex maintains an owner when it has waiters; without
2237 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2238 * need to.
2239 *
2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2241 * via the following:
2242 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2243 * 2) wakeup on uaddr2 after a requeue
2244 * 3) signal
2245 * 4) timeout
2246 *
2247 * If 3, cleanup and return -ERESTARTNOINTR.
2248 *
2249 * If 2, we may then block on trying to take the rt_mutex and return via:
2250 * 5) successful lock
2251 * 6) signal
2252 * 7) timeout
2253 * 8) other lock acquisition failure
2254 *
2255 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2256 *
2257 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2258 *
2259 * Returns:
2260 * 0 - On success
2261 * <0 - On error
2262 */
2263static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2264                 u32 val, ktime_t *abs_time, u32 bitset,
2265                 u32 __user *uaddr2)
2266{
2267    struct hrtimer_sleeper timeout, *to = NULL;
2268    struct rt_mutex_waiter rt_waiter;
2269    struct rt_mutex *pi_mutex = NULL;
2270    struct futex_hash_bucket *hb;
2271    union futex_key key2 = FUTEX_KEY_INIT;
2272    struct futex_q q = futex_q_init;
2273    int res, ret;
2274
2275    if (!bitset)
2276        return -EINVAL;
2277
2278    if (abs_time) {
2279        to = &timeout;
2280        hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2281                      CLOCK_REALTIME : CLOCK_MONOTONIC,
2282                      HRTIMER_MODE_ABS);
2283        hrtimer_init_sleeper(to, current);
2284        hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2285                         current->timer_slack_ns);
2286    }
2287
2288    /*
2289     * The waiter is allocated on our stack, manipulated by the requeue
2290     * code while we sleep on uaddr.
2291     */
2292    debug_rt_mutex_init_waiter(&rt_waiter);
2293    rt_waiter.task = NULL;
2294
2295    ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2296    if (unlikely(ret != 0))
2297        goto out;
2298
2299    q.bitset = bitset;
2300    q.rt_waiter = &rt_waiter;
2301    q.requeue_pi_key = &key2;
2302
2303    /*
2304     * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2305     * count.
2306     */
2307    ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2308    if (ret)
2309        goto out_key2;
2310
2311    /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2312    futex_wait_queue_me(hb, &q, to);
2313
2314    spin_lock(&hb->lock);
2315    ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2316    spin_unlock(&hb->lock);
2317    if (ret)
2318        goto out_put_keys;
2319
2320    /*
2321     * In order for us to be here, we know our q.key == key2, and since
2322     * we took the hb->lock above, we also know that futex_requeue() has
2323     * completed and we no longer have to concern ourselves with a wakeup
2324     * race with the atomic proxy lock acquisition by the requeue code. The
2325     * futex_requeue dropped our key1 reference and incremented our key2
2326     * reference count.
2327     */
2328
2329    /* Check if the requeue code acquired the second futex for us. */
2330    if (!q.rt_waiter) {
2331        /*
2332         * Got the lock. We might not be the anticipated owner if we
2333         * did a lock-steal - fix up the PI-state in that case.
2334         */
2335        if (q.pi_state && (q.pi_state->owner != current)) {
2336            spin_lock(q.lock_ptr);
2337            ret = fixup_pi_state_owner(uaddr2, &q, current);
2338            spin_unlock(q.lock_ptr);
2339        }
2340    } else {
2341        /*
2342         * We have been woken up by futex_unlock_pi(), a timeout, or a
2343         * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2344         * the pi_state.
2345         */
2346        WARN_ON(!q.pi_state);
2347        pi_mutex = &q.pi_state->pi_mutex;
2348        ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2349        debug_rt_mutex_free_waiter(&rt_waiter);
2350
2351        spin_lock(q.lock_ptr);
2352        /*
2353         * Fixup the pi_state owner and possibly acquire the lock if we
2354         * haven't already.
2355         */
2356        res = fixup_owner(uaddr2, &q, !ret);
2357        /*
2358         * If fixup_owner() returned an error, propagate that. If it
2359         * acquired the lock, clear -ETIMEDOUT or -EINTR.
2360         */
2361        if (res)
2362            ret = (res < 0) ? res : 0;
2363
2364        /* Unqueue and drop the lock. */
2365        unqueue_me_pi(&q);
2366    }
2367
2368    /*
2369     * If fixup_pi_state_owner() faulted and was unable to handle the
2370     * fault, unlock the rt_mutex and return the fault to userspace.
2371     */
2372    if (ret == -EFAULT) {
2373        if (rt_mutex_owner(pi_mutex) == current)
2374            rt_mutex_unlock(pi_mutex);
2375    } else if (ret == -EINTR) {
2376        /*
2377         * We've already been requeued, but cannot restart by calling
2378         * futex_lock_pi() directly. We could restart this syscall, but
2379         * it would detect that the user space "val" changed and return
2380         * -EWOULDBLOCK. Save the overhead of the restart and return
2381         * -EWOULDBLOCK directly.
2382         */
2383        ret = -EWOULDBLOCK;
2384    }
2385
2386out_put_keys:
2387    put_futex_key(&q.key);
2388out_key2:
2389    put_futex_key(&key2);
2390
2391out:
2392    if (to) {
2393        hrtimer_cancel(&to->timer);
2394        destroy_hrtimer_on_stack(&to->timer);
2395    }
2396    return ret;
2397}
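
/*
 * Illustrative userspace sketch (not part of this file) of the pairing the
 * kerneldoc above describes, roughly what a PI-aware condition variable
 * does: waiters sleep on the non-PI condvar word while naming the PI mutex
 * up front, and the signaller moves them over with FUTEX_CMP_REQUEUE_PI,
 * whose nr_requeue travels in the timeout argument slot (see sys_futex()
 * below). All names are hypothetical.
 *
 *    #include <limits.h>
 *    #include <linux/futex.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    // Waiter: the kernel may acquire the PI mutex for us during the
 *    // requeue, so it has to know about mutex_word from the start.
 *    static long cond_wait_pi(uint32_t *cond_word, uint32_t seen,
 *                             uint32_t *mutex_word)
 *    {
 *        // 4th argument: optional absolute timeout, none here.
 *        return syscall(SYS_futex, cond_word,
 *                       FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
 *                       seen, NULL, mutex_word, 0);
 *    }
 *
 *    // Signaller: nr_wake must be 1; the remaining waiters are requeued
 *    // onto the PI futex. val3 is the expected value of *cond_word.
 *    static long cond_broadcast_pi(uint32_t *cond_word, uint32_t cond_val,
 *                                  uint32_t *mutex_word)
 *    {
 *        return syscall(SYS_futex, cond_word,
 *                       FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
 *                       1, (unsigned long)INT_MAX, mutex_word, cond_val);
 *    }
 */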
2398
2399/*
2400 * Support for robust futexes: the kernel cleans up held futexes at
2401 * thread exit time.
2402 *
2403 * Implementation: user-space maintains a per-thread list of locks it
2404 * is holding. Upon do_exit(), the kernel carefully walks this list,
2405 * and marks all locks that are owned by this thread with the
2406 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2407 * always manipulated with the lock held, so the list is private and
2408 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2409 * field, to allow the kernel to clean up if the thread dies after
2410 * acquiring the lock, but just before it could have added itself to
2411 * the list. There can only be one such pending lock.
2412 */
2413
2414/**
2415 * sys_set_robust_list() - Set the robust-futex list head of a task
2416 * @head: pointer to the list-head
2417 * @len: length of the list-head, as userspace expects
2418 */
2419SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2420        size_t, len)
2421{
2422    if (!futex_cmpxchg_enabled)
2423        return -ENOSYS;
2424    /*
2425     * The kernel knows only one size for now:
2426     */
2427    if (unlikely(len != sizeof(*head)))
2428        return -EINVAL;
2429
2430    current->robust_list = head;
2431
2432    return 0;
2433}
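
/*
 * Illustrative userspace sketch (not part of this file): what a robust-lock
 * implementation registers through the syscall above, following the
 * protocol comment before it. The robust_mutex layout and helper are
 * hypothetical; struct robust_list_head itself comes from <linux/futex.h>.
 *
 *    #include <linux/futex.h>
 *    #include <stddef.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    // One robust lock: the list node the kernel walks plus the futex
 *    // word it inspects at futex_offset bytes from that node.
 *    struct robust_mutex {
 *        struct robust_list list;
 *        uint32_t futex_word;    // 0 or owner TID (+ state bits)
 *    };
 *
 *    static __thread struct robust_list_head robust_head;
 *
 *    // Called once per thread; exit_robust_list() walks this head when
 *    // the thread dies.
 *    static int register_robust_list(void)
 *    {
 *        robust_head.list.next = &robust_head.list;    // empty list
 *        robust_head.futex_offset =
 *            offsetof(struct robust_mutex, futex_word) -
 *            offsetof(struct robust_mutex, list);
 *        robust_head.list_op_pending = NULL;
 *
 *        return syscall(SYS_set_robust_list, &robust_head,
 *                       sizeof(robust_head));
 *    }
 */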
2434
2435/**
2436 * sys_get_robust_list() - Get the robust-futex list head of a task
2437 * @pid: pid of the process [zero for current task]
2438 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2439 * @len_ptr: pointer to a length field, the kernel fills in the header size
2440 */
2441SYSCALL_DEFINE3(get_robust_list, int, pid,
2442        struct robust_list_head __user * __user *, head_ptr,
2443        size_t __user *, len_ptr)
2444{
2445    struct robust_list_head __user *head;
2446    unsigned long ret;
2447    struct task_struct *p;
2448
2449    if (!futex_cmpxchg_enabled)
2450        return -ENOSYS;
2451
2452    WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2453
2454    rcu_read_lock();
2455
2456    ret = -ESRCH;
2457    if (!pid)
2458        p = current;
2459    else {
2460        p = find_task_by_vpid(pid);
2461        if (!p)
2462            goto err_unlock;
2463    }
2464
2465    ret = -EPERM;
2466    if (!ptrace_may_access(p, PTRACE_MODE_READ))
2467        goto err_unlock;
2468
2469    head = p->robust_list;
2470    rcu_read_unlock();
2471
2472    if (put_user(sizeof(*head), len_ptr))
2473        return -EFAULT;
2474    return put_user(head, head_ptr);
2475
2476err_unlock:
2477    rcu_read_unlock();
2478
2479    return ret;
2480}
2481
2482/*
2483 * Process a futex-list entry, check whether it's owned by the
2484 * dying task, and do notification if so:
2485 */
2486int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2487{
2488    u32 uval, uninitialized_var(nval), mval;
2489
2490retry:
2491    if (get_user(uval, uaddr))
2492        return -1;
2493
2494    if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2495        /*
2496         * Ok, this dying thread is truly holding a futex
2497         * of interest. Set the OWNER_DIED bit atomically
2498         * via cmpxchg, and if the value had FUTEX_WAITERS
2499         * set, wake up a waiter (if any). (We have to do a
2500         * futex_wake() even if OWNER_DIED is already set -
2501         * to handle the rare but possible case of recursive
2502         * thread-death.) The rest of the cleanup is done in
2503         * userspace.
2504         */
2505        mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2506        /*
2507         * We are not holding a lock here, but we want to have
2508         * the pagefault_disable/enable() protection because
2509         * we want to handle the fault gracefully. If the
2510         * access fails we try to fault in the futex with R/W
2511         * verification via get_user_pages. get_user() above
2512         * does not guarantee R/W access. If that fails we
2513         * give up and leave the futex locked.
2514         */
2515        if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2516            if (fault_in_user_writeable(uaddr))
2517                return -1;
2518            goto retry;
2519        }
2520        if (nval != uval)
2521            goto retry;
2522
2523        /*
2524         * Wake robust non-PI futexes here. The wakeup of
2525         * PI futexes happens in exit_pi_state_list():
2526         */
2527        if (!pi && (uval & FUTEX_WAITERS))
2528            futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2529    }
2530    return 0;
2531}
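
/*
 * Illustrative userspace sketch (not part of this file): the
 * FUTEX_OWNER_DIED bit set above must be noticed by the next acquirer,
 * which is what EOWNERDEAD-style reporting is built on. Hypothetical
 * try-lock for a robust, non-PI lock word.
 *
 *    #include <linux/futex.h>
 *    #include <stdatomic.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    // Returns 0 if acquired cleanly, 1 if acquired but the previous
 *    // owner died holding it (caller must recover the protected state),
 *    // -1 if a live owner holds it.
 *    static int robust_try_lock(_Atomic uint32_t *word)
 *    {
 *        uint32_t tid = (uint32_t)syscall(SYS_gettid);
 *        uint32_t old = atomic_load(word);
 *
 *        for (;;) {
 *            if ((old & FUTEX_TID_MASK) != 0 &&
 *                !(old & FUTEX_OWNER_DIED))
 *                return -1;    // held by a live owner
 *
 *            // Free or abandoned: take over, keeping the waiter bit.
 *            uint32_t new = tid | (old & FUTEX_WAITERS);
 *
 *            if (atomic_compare_exchange_weak(word, &old, new))
 *                return (old & FUTEX_OWNER_DIED) ? 1 : 0;
 *            // CAS failed: 'old' was reloaded, evaluate it again.
 *        }
 *    }
 */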
2532
2533/*
2534 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2535 */
2536static inline int fetch_robust_entry(struct robust_list __user **entry,
2537                     struct robust_list __user * __user *head,
2538                     unsigned int *pi)
2539{
2540    unsigned long uentry;
2541
2542    if (get_user(uentry, (unsigned long __user *)head))
2543        return -EFAULT;
2544
2545    *entry = (void __user *)(uentry & ~1UL);
2546    *pi = uentry & 1;
2547
2548    return 0;
2549}
2550
2551/*
2552 * Walk curr->robust_list (very carefully, it's a userspace list!)
2553 * and mark any locks found there dead, and notify any waiters.
2554 *
2555 * We silently return on any sign of list-walking problem.
2556 */
2557void exit_robust_list(struct task_struct *curr)
2558{
2559    struct robust_list_head __user *head = curr->robust_list;
2560    struct robust_list __user *entry, *next_entry, *pending;
2561    unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2562    unsigned int uninitialized_var(next_pi);
2563    unsigned long futex_offset;
2564    int rc;
2565
2566    if (!futex_cmpxchg_enabled)
2567        return;
2568
2569    /*
2570     * Fetch the list head (which was registered earlier, via
2571     * sys_set_robust_list()):
2572     */
2573    if (fetch_robust_entry(&entry, &head->list.next, &pi))
2574        return;
2575    /*
2576     * Fetch the relative futex offset:
2577     */
2578    if (get_user(futex_offset, &head->futex_offset))
2579        return;
2580    /*
2581     * Fetch any possibly pending lock-add first, and handle it
2582     * if it exists:
2583     */
2584    if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2585        return;
2586
2587    next_entry = NULL; /* avoid warning with gcc */
2588    while (entry != &head->list) {
2589        /*
2590         * Fetch the next entry in the list before calling
2591         * handle_futex_death:
2592         */
2593        rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2594        /*
2595         * A pending lock might already be on the list, so
2596         * don't process it twice:
2597         */
2598        if (entry != pending)
2599            if (handle_futex_death((void __user *)entry + futex_offset,
2600                        curr, pi))
2601                return;
2602        if (rc)
2603            return;
2604        entry = next_entry;
2605        pi = next_pi;
2606        /*
2607         * Avoid excessively long or circular lists:
2608         */
2609        if (!--limit)
2610            break;
2611
2612        cond_resched();
2613    }
2614
2615    if (pending)
2616        handle_futex_death((void __user *)pending + futex_offset,
2617                   curr, pip);
2618}
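
/*
 * Illustrative userspace sketch (not part of this file): exit_robust_list()
 * can only clean up what userspace has published, so an acquire must set
 * list_op_pending before touching the lock word and link the node only
 * once it owns the lock. Companion to the registration sketch after
 * sys_set_robust_list(); names are hypothetical and the contended path is
 * omitted.
 *
 *    #include <linux/futex.h>
 *    #include <stdatomic.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    struct robust_mutex {
 *        struct robust_list list;
 *        _Atomic uint32_t futex_word;
 *    };
 *
 *    // Registered earlier via set_robust_list(), futex_offset included.
 *    extern __thread struct robust_list_head robust_head;
 *
 *    static int robust_lock(struct robust_mutex *m)
 *    {
 *        uint32_t zero = 0;
 *        uint32_t tid = (uint32_t)syscall(SYS_gettid);
 *
 *        // 1) Announce the lock we are about to take, so a death between
 *        //    the cmpxchg and the list insertion is still handled.
 *        robust_head.list_op_pending = &m->list;
 *
 *        if (!atomic_compare_exchange_strong(&m->futex_word, &zero, tid)) {
 *            robust_head.list_op_pending = NULL;
 *            return -1;    // contended; a real lock would FUTEX_WAIT here
 *        }
 *
 *        // 2) We own it: link the node into the per-thread list, then
 *        //    clear the pending marker.
 *        m->list.next = robust_head.list.next;
 *        robust_head.list.next = &m->list;
 *        robust_head.list_op_pending = NULL;
 *        return 0;
 *    }
 */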
2619
2620long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2621        u32 __user *uaddr2, u32 val2, u32 val3)
2622{
2623    int cmd = op & FUTEX_CMD_MASK;
2624    unsigned int flags = 0;
2625
2626    if (!(op & FUTEX_PRIVATE_FLAG))
2627        flags |= FLAGS_SHARED;
2628
2629    if (op & FUTEX_CLOCK_REALTIME) {
2630        flags |= FLAGS_CLOCKRT;
2631        if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2632            return -ENOSYS;
2633    }
2634
2635    switch (cmd) {
2636    case FUTEX_LOCK_PI:
2637    case FUTEX_UNLOCK_PI:
2638    case FUTEX_TRYLOCK_PI:
2639    case FUTEX_WAIT_REQUEUE_PI:
2640    case FUTEX_CMP_REQUEUE_PI:
2641        if (!futex_cmpxchg_enabled)
2642            return -ENOSYS;
2643    }
2644
2645    switch (cmd) {
2646    case FUTEX_WAIT:
2647        val3 = FUTEX_BITSET_MATCH_ANY;
2648    case FUTEX_WAIT_BITSET:
2649        return futex_wait(uaddr, flags, val, timeout, val3);
2650    case FUTEX_WAKE:
2651        val3 = FUTEX_BITSET_MATCH_ANY;
2652    case FUTEX_WAKE_BITSET:
2653        return futex_wake(uaddr, flags, val, val3);
2654    case FUTEX_REQUEUE:
2655        return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2656    case FUTEX_CMP_REQUEUE:
2657        return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2658    case FUTEX_WAKE_OP:
2659        return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2660    case FUTEX_LOCK_PI:
2661        return futex_lock_pi(uaddr, flags, val, timeout, 0);
2662    case FUTEX_UNLOCK_PI:
2663        return futex_unlock_pi(uaddr, flags);
2664    case FUTEX_TRYLOCK_PI:
2665        return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2666    case FUTEX_WAIT_REQUEUE_PI:
2667        val3 = FUTEX_BITSET_MATCH_ANY;
2668        return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2669                         uaddr2);
2670    case FUTEX_CMP_REQUEUE_PI:
2671        return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2672    }
2673    return -ENOSYS;
2674}
2675
2676
2677SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2678        struct timespec __user *, utime, u32 __user *, uaddr2,
2679        u32, val3)
2680{
2681    struct timespec ts;
2682    ktime_t t, *tp = NULL;
2683    u32 val2 = 0;
2684    int cmd = op & FUTEX_CMD_MASK;
2685
2686    if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2687              cmd == FUTEX_WAIT_BITSET ||
2688              cmd == FUTEX_WAIT_REQUEUE_PI)) {
2689        if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2690            return -EFAULT;
2691        if (!timespec_valid(&ts))
2692            return -EINVAL;
2693
2694        t = timespec_to_ktime(ts);
2695        if (cmd == FUTEX_WAIT)
2696            t = ktime_add_safe(ktime_get(), t);
2697        tp = &t;
2698    }
2699    /*
2700     * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2701     * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2702     */
2703    if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2704        cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
2705        val2 = (u32) (unsigned long) utime;
2706
2707    return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2708}
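
/*
 * Illustrative userspace sketch (not part of this file) of the argument
 * multiplexing done above: utime is a relative timespec for FUTEX_WAIT, an
 * absolute one for FUTEX_WAIT_BITSET (CLOCK_MONOTONIC unless
 * FUTEX_CLOCK_REALTIME is set), and is reinterpreted as the integer val2
 * for the requeue and wake-op commands. Helper names are made up.
 *
 *    #include <limits.h>
 *    #include <linux/futex.h>
 *    #include <stdint.h>
 *    #include <sys/syscall.h>
 *    #include <time.h>
 *    #include <unistd.h>
 *
 *    // FUTEX_WAIT: relative timeout; the kernel adds it to the current
 *    // time (ktime_add_safe() above) before arming the hrtimer.
 *    static long wait_relative(uint32_t *word, uint32_t seen,
 *                              struct timespec *rel)
 *    {
 *        return syscall(SYS_futex, word, FUTEX_WAIT_PRIVATE, seen,
 *                       rel, NULL, 0);
 *    }
 *
 *    // FUTEX_WAIT_BITSET: the very same slot carries an absolute expiry.
 *    static long wait_absolute(uint32_t *word, uint32_t seen,
 *                              struct timespec *abs)
 *    {
 *        return syscall(SYS_futex, word,
 *                       FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, seen,
 *                       abs, NULL, FUTEX_BITSET_MATCH_ANY);
 *    }
 *
 *    // FUTEX_CMP_REQUEUE: the slot is now val2, the number of waiters to
 *    // move from 'from' to 'to'; val3 is the expected value of *from.
 *    static long requeue_all(uint32_t *from, uint32_t *to, uint32_t expected)
 *    {
 *        return syscall(SYS_futex, from,
 *                       FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG,
 *                       1, (unsigned long)INT_MAX, to, expected);
 *    }
 */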
2709
2710static int __init futex_init(void)
2711{
2712    u32 curval;
2713    int i;
2714
2715    /*
2716     * This will fail and we want it. Some arch implementations do
2717     * runtime detection of the futex_atomic_cmpxchg_inatomic()
2718     * functionality. We want to know that before we call in any
2719     * of the complex code paths. Also we want to prevent
2720     * registration of robust lists in that case. NULL is
2721     * guaranteed to fault and we get -EFAULT on functional
2722     * implementation, the non-functional ones will return
2723     * -ENOSYS.
2724     */
2725    if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2726        futex_cmpxchg_enabled = 1;
2727
2728    for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2729        plist_head_init(&futex_queues[i].chain);
2730        spin_lock_init(&futex_queues[i].lock);
2731    }
2732
2733    return 0;
2734}
2735__initcall(futex_init);
2736
