/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 * of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>

/* #define SECCOMP_DEBUG 1 */

#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section. In general, this
 *         is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
 * @len: the number of instructions in the program
 * @insns: the BPF program instructions to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer. For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory. This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
	atomic_t usage;
	struct seccomp_filter *prev;
	unsigned short len;  /* Instruction count */
	struct sock_filter insns[];
};
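
/*
 * Illustrative sketch (not part of this file, filter names F1 and F2 are
 * hypothetical): after a task installs F1, forks, and the child installs
 * F2, the @prev links look like:
 *
 *	parent: seccomp.filter --> F1 --> NULL       (F1->usage == 2)
 *	child:  seccomp.filter --> F2 --> F1 --> NULL
 *
 * Both tasks evaluate F1; only the child also evaluates F2. The shared
 * F1 node is what makes the structure a tree rather than a plain list.
 */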

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))

/**
 * get_u32 - returns a u32 offset into data
 * @data: an unsigned 64-bit value
 * @index: 0 or 1 to return the first or second 32-bits
 *
 * This inline exists to hide the length of unsigned long. If a 32-bit
 * unsigned long is passed in, it will be extended and the top 32-bits will be
 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
 * properly returned.
 *
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static inline u32 get_u32(u64 data, int index)
{
	return ((u32 *)&data)[index];
}
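
/*
 * Worked example (illustrative only): on a little-endian machine,
 * get_u32(0x1122334455667788ULL, 0) returns 0x55667788 and
 * get_u32(0x1122334455667788ULL, 1) returns 0x11223344; a big-endian
 * machine returns them the other way around, which is why filter authors
 * must account for endianness themselves.
 */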

/* Helper for bpf_load below. */
#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
/**
 * bpf_load: checks and returns a pointer to the requested offset
 * @off: offset into struct seccomp_data to load from
 *
 * Returns the requested 32-bits of data.
 * seccomp_check_filter() should assure that @off is 32-bit aligned
 * and not out of bounds. Failure to do so is a BUG.
 */
u32 seccomp_bpf_load(int off)
{
	struct pt_regs *regs = task_pt_regs(current);
	if (off == BPF_DATA(nr))
		return syscall_get_nr(current, regs);
	if (off == BPF_DATA(arch))
		return syscall_get_arch(current, regs);
	if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
		unsigned long value;
		int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
		int index = !!(off % sizeof(u64));
		syscall_get_arguments(current, regs, arg, 1, &value);
		return get_u32(value, index);
	}
	if (off == BPF_DATA(instruction_pointer))
		return get_u32(KSTK_EIP(current), 0);
	if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
		return get_u32(KSTK_EIP(current), 1);
	/* seccomp_check_filter should make this impossible. */
	BUG();
}
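
/*
 * Example of the offset math above (illustrative): with
 * off == BPF_DATA(args[1]) the load selects arg == 1, index == 0, i.e. the
 * first 32-bit word of syscall argument 1; off == BPF_DATA(args[1]) + 4
 * selects index == 1, the second word. Which word holds the low half of the
 * argument depends on endianness, as noted for get_u32() above.
 */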

/**
 * seccomp_check_filter - verify seccomp filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Takes a previously checked filter (by sk_chk_filter) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load. It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
	int pc;
	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_S_LD_W_ABS:
			ftest->code = BPF_S_ANC_SECCOMP_LD_W;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct seccomp_data) || k & 3)
				return -EINVAL;
			continue;
		case BPF_S_LD_W_LEN:
			ftest->code = BPF_S_LD_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		case BPF_S_LDX_W_LEN:
			ftest->code = BPF_S_LDX_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_S_RET_K:
		case BPF_S_RET_A:
		case BPF_S_ALU_ADD_K:
		case BPF_S_ALU_ADD_X:
		case BPF_S_ALU_SUB_K:
		case BPF_S_ALU_SUB_X:
		case BPF_S_ALU_MUL_K:
		case BPF_S_ALU_MUL_X:
		case BPF_S_ALU_DIV_X:
		case BPF_S_ALU_AND_K:
		case BPF_S_ALU_AND_X:
		case BPF_S_ALU_OR_K:
		case BPF_S_ALU_OR_X:
		case BPF_S_ALU_XOR_K:
		case BPF_S_ALU_XOR_X:
		case BPF_S_ALU_LSH_K:
		case BPF_S_ALU_LSH_X:
		case BPF_S_ALU_RSH_K:
		case BPF_S_ALU_RSH_X:
		case BPF_S_ALU_NEG:
		case BPF_S_LD_IMM:
		case BPF_S_LDX_IMM:
		case BPF_S_MISC_TAX:
		case BPF_S_MISC_TXA:
		case BPF_S_ALU_DIV_K:
		case BPF_S_LD_MEM:
		case BPF_S_LDX_MEM:
		case BPF_S_ST:
		case BPF_S_STX:
		case BPF_S_JMP_JA:
		case BPF_S_JMP_JEQ_K:
		case BPF_S_JMP_JEQ_X:
		case BPF_S_JMP_JGE_K:
		case BPF_S_JMP_JGE_X:
		case BPF_S_JMP_JGT_K:
		case BPF_S_JMP_JGT_X:
		case BPF_S_JMP_JSET_K:
		case BPF_S_JMP_JSET_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
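
/*
 * Illustrative note on the rewrite above: a user instruction such as
 *	BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr))
 * arrives here decoded as BPF_S_LD_W_ABS and is rewritten to
 * BPF_S_ANC_SECCOMP_LD_W, so that sk_run_filter() fetches the word through
 * seccomp_bpf_load() instead of reading sk_buff data.
 */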

/**
 * seccomp_run_filters - evaluates all seccomp filters against @syscall
 * @syscall: number of the current system call
 *
 * Returns valid seccomp BPF response codes.
 */
static u32 seccomp_run_filters(int syscall)
{
	struct seccomp_filter *f;
	u32 ret = SECCOMP_RET_ALLOW;

	/* Ensure unexpected behavior doesn't result in failing open. */
	if (WARN_ON(current->seccomp.filter == NULL))
		return SECCOMP_RET_KILL;

	/*
	 * All filters in the list are evaluated and the lowest BPF return
	 * value always takes priority (ignoring the DATA).
	 */
	for (f = current->seccomp.filter; f; f = f->prev) {
		u32 cur_ret = sk_run_filter(NULL, f->insns);
		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
			ret = cur_ret;
	}
	return ret;
}
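
/*
 * Illustrative precedence example: with the action values from
 * <linux/seccomp.h> (SECCOMP_RET_KILL == 0x00000000, SECCOMP_RET_TRAP ==
 * 0x00030000, SECCOMP_RET_ERRNO == 0x00050000, SECCOMP_RET_TRACE ==
 * 0x7ff00000, SECCOMP_RET_ALLOW == 0x7fff0000), a stack of two filters
 * returning TRACE and ERRNO yields ERRNO, since the numerically lowest
 * action wins.
 */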

/**
 * seccomp_attach_filter: Attaches a seccomp filter to current.
 * @fprog: BPF program to install
 *
 * Returns 0 on success or an errno on failure.
 */
static long seccomp_attach_filter(struct sock_fprog *fprog)
{
	struct seccomp_filter *filter;
	unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
	unsigned long total_insns = fprog->len;
	long ret;

	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
		return -EINVAL;

	for (filter = current->seccomp.filter; filter; filter = filter->prev)
		total_insns += filter->len + 4;  /* include a 4 instr penalty */
	if (total_insns > MAX_INSNS_PER_PATH)
		return -ENOMEM;

	/*
	 * Installing a seccomp filter requires that the task have
	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
	 * This avoids scenarios where unprivileged tasks can affect the
	 * behavior of privileged children.
	 */
	if (!current->no_new_privs &&
	    security_capable_noaudit(current_cred(), current_user_ns(),
				     CAP_SYS_ADMIN) != 0)
		return -EACCES;

	/* Allocate a new seccomp_filter */
	filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
			 GFP_KERNEL|__GFP_NOWARN);
	if (!filter)
		return -ENOMEM;
	atomic_set(&filter->usage, 1);
	filter->len = fprog->len;

	/* Copy the instructions from fprog. */
	ret = -EFAULT;
	if (copy_from_user(filter->insns, fprog->filter, fp_size))
		goto fail;

	/* Check and rewrite the fprog via the skb checker */
	ret = sk_chk_filter(filter->insns, filter->len);
	if (ret)
		goto fail;

	/* Check and rewrite the fprog for seccomp use */
	ret = seccomp_check_filter(filter->insns, filter->len);
	if (ret)
		goto fail;

	/*
	 * If there is an existing filter, make it the prev and don't drop its
	 * task reference.
	 */
	filter->prev = current->seccomp.filter;
	current->seccomp.filter = filter;
	return 0;
fail:
	kfree(filter);
	return ret;
}
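
/*
 * Illustrative accounting example: sizeof(struct sock_filter) is 8 bytes,
 * so MAX_INSNS_PER_PATH is (1 << 18) / 8 == 32768 instructions. A task that
 * already has two 100-instruction filters attached consumes
 * 2 * (100 + 4) == 208 of that budget, leaving 32768 - 208 == 32560
 * instructions of path budget for future filters (each individual program
 * is further capped at BPF_MAXINSNS).
 */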

/**
 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns 0 on success and non-zero otherwise.
 */
long seccomp_attach_user_filter(char __user *user_filter)
{
	struct sock_fprog fprog;
	long ret = -EFAULT;

#ifdef CONFIG_COMPAT
	if (is_compat_task()) {
		struct compat_sock_fprog fprog32;
		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
			goto out;
		fprog.len = fprog32.len;
		fprog.filter = compat_ptr(fprog32.filter);
	} else /* falls through to the if below. */
#endif
	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
		goto out;
	ret = seccomp_attach_filter(&fprog);
out:
	return ret;
}

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	if (!orig)
		return;
	/* Reference count is bounded by the number of total processes. */
	atomic_inc(&orig->usage);
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	/* Clean up single-reference branches iteratively. */
	while (orig && atomic_dec_and_test(&orig->usage)) {
		struct seccomp_filter *freeme = orig;
		orig = orig->prev;
		kfree(freeme);
	}
}
/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
	struct siginfo info;
	memset(&info, 0, sizeof(info));
	info.si_signo = SIGSYS;
	info.si_code = SYS_SECCOMP;
	info.si_call_addr = (void __user *)KSTK_EIP(current);
	info.si_errno = reason;
	info.si_arch = syscall_get_arch(current, task_pt_regs(current));
	info.si_syscall = syscall;
	force_sig_info(SIGSYS, &info, current);
}
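
/*
 * Userland sketch (not part of this file; how the siginfo fields are
 * exposed depends on the libc): a process whose filter returns
 * SECCOMP_RET_TRAP can catch the resulting SIGSYS and inspect the fields
 * filled in above:
 *
 *	static void sigsys_handler(int sig, siginfo_t *info, void *ctx)
 *	{
 *		// info->si_syscall: the trapped syscall number
 *		// info->si_errno:   the filter's SECCOMP_RET_DATA bits
 *		// info->si_call_addr, info->si_arch: faulting IP and arch
 *	}
 *
 *	struct sigaction act = { .sa_sigaction = sigsys_handler,
 *				 .sa_flags = SA_SIGINFO };
 *	sigaction(SIGSYS, &act, NULL);
 */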
#endif /* CONFIG_SECCOMP_FILTER */

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static int mode1_syscalls[] = {
	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
	0, /* null terminated */
};

#ifdef CONFIG_COMPAT
static int mode1_syscalls_32[] = {
	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
	0, /* null terminated */
};
#endif

int __secure_computing(int this_syscall)
{
	int mode = current->seccomp.mode;
	int exit_sig = 0;
	int *syscall;
	u32 ret;

	switch (mode) {
	case SECCOMP_MODE_STRICT:
		syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
		if (is_compat_task())
			syscall = mode1_syscalls_32;
#endif
		do {
			if (*syscall == this_syscall)
				return 0;
		} while (*++syscall);
		exit_sig = SIGKILL;
		ret = SECCOMP_RET_KILL;
		break;
#ifdef CONFIG_SECCOMP_FILTER
	case SECCOMP_MODE_FILTER: {
		int data;
		struct pt_regs *regs = task_pt_regs(current);
		ret = seccomp_run_filters(this_syscall);
		data = ret & SECCOMP_RET_DATA;
		ret &= SECCOMP_RET_ACTION;
		switch (ret) {
		case SECCOMP_RET_ERRNO:
			/* Set the low-order 16-bits as an errno. */
			syscall_set_return_value(current, regs,
						 -data, 0);
			goto skip;
		case SECCOMP_RET_TRAP:
			/* Show the handler the original registers. */
			syscall_rollback(current, regs);
			/* Let the filter pass back 16 bits of data. */
			seccomp_send_sigsys(this_syscall, data);
			goto skip;
		case SECCOMP_RET_TRACE:
			/* Skip these calls if there is no tracer. */
			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
				syscall_set_return_value(current, regs,
							 -ENOSYS, 0);
				goto skip;
			}
			/* Allow the BPF to provide the event message */
			ptrace_event(PTRACE_EVENT_SECCOMP, data);
			/*
			 * The delivery of a fatal signal during event
			 * notification may silently skip tracer notification.
			 * Terminating the task now avoids executing a system
			 * call that may not be intended.
			 */
			if (fatal_signal_pending(current))
				break;
			if (syscall_get_nr(current, regs) < 0)
				goto skip;  /* Explicit request to skip. */

			return 0;
		case SECCOMP_RET_ALLOW:
			return 0;
		case SECCOMP_RET_KILL:
		default:
			break;
		}
		exit_sig = SIGSYS;
		break;
	}
#endif
	default:
		BUG();
	}

#ifdef SECCOMP_DEBUG
	dump_stack();
#endif
	audit_seccomp(this_syscall, exit_sig, ret);
	do_exit(exit_sig);
#ifdef CONFIG_SECCOMP_FILTER
skip:
	audit_seccomp(this_syscall, exit_sig, ret);
#endif
	return -1;
}

long prctl_get_seccomp(void)
{
	return current->seccomp.mode;
}

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * This function may be called repeatedly with a @seccomp_mode of
 * SECCOMP_MODE_FILTER to install additional filters. Every filter
 * successfully installed will be evaluated (in reverse order) for each system
 * call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
	long ret = -EINVAL;

	if (current->seccomp.mode &&
	    current->seccomp.mode != seccomp_mode)
		goto out;

	switch (seccomp_mode) {
	case SECCOMP_MODE_STRICT:
		ret = 0;
#ifdef TIF_NOTSC
		disable_TSC();
#endif
		break;
#ifdef CONFIG_SECCOMP_FILTER
	case SECCOMP_MODE_FILTER:
		ret = seccomp_attach_user_filter(filter);
		if (ret)
			goto out;
		break;
#endif
	default:
		goto out;
	}

	current->seccomp.mode = seccomp_mode;
	set_thread_flag(TIF_SECCOMP);
out:
	return ret;
}
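
/*
 * Userland sketch (illustrative, not part of this file): installing a
 * mode-2 filter that fails __NR_getpid with EPERM and allows everything
 * else. PR_SET_NO_NEW_PRIVS is set first so no CAP_SYS_ADMIN is needed.
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
 *		BPF_STMT(BPF_RET|BPF_K,
 *			 SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
 *		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *	struct sock_fprog prog = { .len = 4, .filter = insns };
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 */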