/*
 * Copyright (c) 2007 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/rbtree.h>

#include <asm-generic/bitops/le.h>

#include "rds.h"

/*
 * This file implements the receive side of the unconventional congestion
 * management in RDS.
 *
 * Messages waiting in the receive queue on the receiving socket are accounted
 * against the socket's SO_RCVBUF option value.  Only the payload bytes in the
 * message are accounted for.  If the number of bytes queued equals or exceeds
 * rcvbuf then the socket is congested.  All sends attempted to this socket's
 * address should block or return -EWOULDBLOCK.
 *
 * Applications are expected to be reasonably tuned such that this situation
 * very rarely occurs.  An application encountering this "back-pressure" is
 * considered to have a bug.
 *
 * This is implemented by having each node maintain bitmaps which indicate
 * which ports on bound addresses are congested.  As the bitmap changes it is
 * sent through all the connections which terminate in the local address of the
 * bitmap which changed.
 *
 * The bitmaps are allocated as connections are brought up.  This avoids
 * allocation in the interrupt handling path which queues messages on sockets.
 * The dense bitmaps let transports send the entire bitmap on any bitmap change
 * reasonably efficiently.  This is much easier to implement than some
 * finer-grained communication of per-port congestion.  The sender does a very
 * inexpensive bit test to check whether the port it's about to send to is
 * congested or not.
 */
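
/*
 * A worked sizing example (assuming the usual definitions in rds.h, i.e.
 * one bit per 16-bit port and RDS_CONG_MAP_BYTES == 65536 / 8): each map
 * is 8 KB, so with a 4 KB PAGE_SIZE it spans RDS_CONG_MAP_PAGES == 2 pages
 * and each page holds RDS_CONG_MAP_PAGE_BITS == 32768 port bits.
 */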

/*
 * Interaction with poll is a tad tricky. We want all processes stuck in
 * poll to wake up and check whether a congested destination became uncongested.
 * The really sad thing is we have no idea which destinations the application
 * wants to send to - we don't even know which rds_connections are involved.
 * So until we implement a more flexible rds poll interface, we have to make
 * do with this:
 * We maintain a global counter that is incremented each time a congestion map
 * update is received. Each rds socket tracks this value, and if rds_poll
 * finds that the saved generation number is smaller than the global generation
 * number, it wakes up the process.
 */
static atomic_t rds_cong_generation = ATOMIC_INIT(0);

/*
 * Congestion monitoring
 */
static LIST_HEAD(rds_cong_monitor);
static DEFINE_RWLOCK(rds_cong_monitor_lock);

/*
 * Yes, a global lock.  It's used so infrequently that it's worth keeping it
 * global to simplify the locking.  It's only used in the following
 * circumstances:
 *
 *  - on connection buildup to associate a conn with its maps
 *  - on map changes to inform conns of a new map to send
 *
 *  It's sadly ordered under the socket callback lock and the connection lock.
 *  Receive paths can mark ports congested from interrupt context so the
 *  lock masks interrupts.
 */
static DEFINE_SPINLOCK(rds_cong_lock);
static struct rb_root rds_cong_tree = RB_ROOT;

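/*
 * Look up the map for @addr in the global rb-tree.  Returns the existing
 * map if one is found.  If none is found and @insert is non-NULL, @insert
 * is linked into the tree at that position and NULL is returned.  Callers
 * hold rds_cong_lock.
 */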
static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
					       struct rds_cong_map *insert)
{
	struct rb_node **p = &rds_cong_tree.rb_node;
	struct rb_node *parent = NULL;
	struct rds_cong_map *map;

	while (*p) {
		parent = *p;
		map = rb_entry(parent, struct rds_cong_map, m_rb_node);

		if (addr < map->m_addr)
			p = &(*p)->rb_left;
		else if (addr > map->m_addr)
			p = &(*p)->rb_right;
		else
			return map;
	}

	if (insert) {
		rb_link_node(&insert->m_rb_node, parent, p);
		rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
	}
	return NULL;
}

/*
 * There is only ever one bitmap for any address.  Connections try to
 * allocate these bitmaps, getting pointers to them in the process.  The
 * bitmaps are only ever freed as the module is removed after all connections
 * have been freed.
 */
static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
{
	struct rds_cong_map *map;
	struct rds_cong_map *ret = NULL;
	unsigned long zp;
	unsigned long i;
	unsigned long flags;

	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
	if (map == NULL)
		return NULL;

	map->m_addr = addr;
	init_waitqueue_head(&map->m_waitq);
	INIT_LIST_HEAD(&map->m_conn_list);

	for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
		zp = get_zeroed_page(GFP_KERNEL);
		if (zp == 0)
			goto out;
		map->m_page_addrs[i] = zp;
	}

	spin_lock_irqsave(&rds_cong_lock, flags);
	ret = rds_cong_tree_walk(addr, map);
	spin_unlock_irqrestore(&rds_cong_lock, flags);

	if (ret == NULL) {
		ret = map;
		map = NULL;
	}

out:
	if (map) {
		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
			free_page(map->m_page_addrs[i]);
		kfree(map);
	}

	rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));

	return ret;
}

/*
 * Put the conn on its local map's list.  This is called when the conn is
 * really added to the hash.  It's nested under the rds_conn_lock, sadly.
 */
void rds_cong_add_conn(struct rds_connection *conn)
{
	unsigned long flags;

	rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
	spin_lock_irqsave(&rds_cong_lock, flags);
	list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
	spin_unlock_irqrestore(&rds_cong_lock, flags);
}

void rds_cong_remove_conn(struct rds_connection *conn)
{
	unsigned long flags;

	rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
	spin_lock_irqsave(&rds_cong_lock, flags);
	list_del_init(&conn->c_map_item);
	spin_unlock_irqrestore(&rds_cong_lock, flags);
}

int rds_cong_get_maps(struct rds_connection *conn)
{
	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);

	if (conn->c_lcong == NULL || conn->c_fcong == NULL)
		return -ENOMEM;

	return 0;
}

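/*
 * Ask every connection whose local address owns this map to (re)send the
 * bitmap to its peer.  Loopback connections are skipped; for loopback the
 * local and remote addresses resolve to the same map, so there is nothing
 * to transmit.
 */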
void rds_cong_queue_updates(struct rds_cong_map *map)
{
	struct rds_connection *conn;
	unsigned long flags;

	spin_lock_irqsave(&rds_cong_lock, flags);

	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
		if (conn->c_loopback)
			continue;
		if (!test_and_set_bit(0, &conn->c_map_queued)) {
			rds_stats_inc(s_cong_update_queued);
			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
		}
	}

	spin_unlock_irqrestore(&rds_cong_lock, flags);
}

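/*
 * Called when the map for @map->m_addr has changed (typically because a
 * congestion update arrived from the peer).  Bump the global generation so
 * pollers re-check, wake anyone sleeping on this map, and notify monitoring
 * sockets whose registered ports intersect @portmask.
 */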
void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
{
	rdsdebug("waking map %p for %pI4\n",
		 map, &map->m_addr);
	rds_stats_inc(s_cong_update_received);
	atomic_inc(&rds_cong_generation);
	if (waitqueue_active(&map->m_waitq))
		wake_up(&map->m_waitq);
	if (waitqueue_active(&rds_poll_waitq))
		wake_up_all(&rds_poll_waitq);

	if (portmask && !list_empty(&rds_cong_monitor)) {
		unsigned long flags;
		struct rds_sock *rs;

		read_lock_irqsave(&rds_cong_monitor_lock, flags);
		list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
			spin_lock(&rs->rs_lock);
			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
			rs->rs_cong_mask &= ~portmask;
			spin_unlock(&rs->rs_lock);
			if (rs->rs_cong_notify)
				rds_wake_sk_sleep(rs);
		}
		read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
	}
}
EXPORT_SYMBOL_GPL(rds_cong_map_updated);

int rds_cong_updated_since(unsigned long *recent)
{
	unsigned long gen = atomic_read(&rds_cong_generation);

	if (likely(*recent == gen))
		return 0;
	*recent = gen;
	return 1;
}

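/*
 * A rough sketch of how a poller consumes the generation counter, assuming
 * it keeps the last-seen value in its socket (the real logic lives in
 * rds_poll() in af_rds.c; rs_cong_track here stands for that saved value):
 *
 *	if (rds_cong_updated_since(&rs->rs_cong_track))
 *		mask |= (POLLIN | POLLOUT);
 */
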
/*
 * We're called under the locking that protects the socket's receive buffer
 * consumption.  This makes it a lot easier for the caller to only call us
 * when it knows that an existing set bit needs to be cleared, and vice versa.
 * We can't block and we need to deal with concurrent sockets working against
 * the same per-address map.
 */
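/*
 * The bit for a port lives in page "port / RDS_CONG_MAP_PAGE_BITS" at bit
 * offset "port % RDS_CONG_MAP_PAGE_BITS".  As a worked example, assuming
 * 4 KB pages (RDS_CONG_MAP_PAGE_BITS == 32768), port 40000 lands in page 1
 * at bit offset 7232.
 */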
void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
{
	unsigned long i;
	unsigned long off;

	rdsdebug("setting congestion for %pI4:%u in map %p\n",
		 &map->m_addr, ntohs(port), map);

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
}

void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
{
	unsigned long i;
	unsigned long off;

	rdsdebug("clearing congestion for %pI4:%u in map %p\n",
		 &map->m_addr, ntohs(port), map);

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
}

static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
{
	unsigned long i;
	unsigned long off;

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
}

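/*
 * Sockets that have asked for congestion monitoring sit on the global
 * rds_cong_monitor list so that rds_cong_map_updated() can post per-port
 * notifications to them.
 */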
void rds_cong_add_socket(struct rds_sock *rs)
{
	unsigned long flags;

	write_lock_irqsave(&rds_cong_monitor_lock, flags);
	if (list_empty(&rs->rs_cong_list))
		list_add(&rs->rs_cong_list, &rds_cong_monitor);
	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
}

void rds_cong_remove_socket(struct rds_sock *rs)
{
	unsigned long flags;
	struct rds_cong_map *map;

	write_lock_irqsave(&rds_cong_monitor_lock, flags);
	list_del_init(&rs->rs_cong_list);
	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);

	/* update congestion map for now-closed port */
	spin_lock_irqsave(&rds_cong_lock, flags);
	map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
	spin_unlock_irqrestore(&rds_cong_lock, flags);

	if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
		rds_cong_clear_bit(map, rs->rs_bound_port);
		rds_cong_queue_updates(map);
	}
}

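/*
 * Wait for @port in @map to become uncongested, or fail fast for nonblocking
 * senders.  The send path is the expected caller, testing the remote peer's
 * map before queueing a message; a sketch of such a call (the real call site
 * is in send.c, and dport stands for the destination port):
 *
 *	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
 */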
int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
		  struct rds_sock *rs)
{
	if (!rds_cong_test_bit(map, port))
		return 0;
	if (nonblock) {
		if (rs && rs->rs_cong_monitor) {
			unsigned long flags;

			/* It would have been nice to have an atomic set_bit on
			 * a uint64_t. */
			spin_lock_irqsave(&rs->rs_lock, flags);
			rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
			spin_unlock_irqrestore(&rs->rs_lock, flags);

			/* Test again - a congestion update may have arrived in
			 * the meantime. */
			if (!rds_cong_test_bit(map, port))
				return 0;
		}
		rds_stats_inc(s_cong_send_error);
		return -ENOBUFS;
	}

	rds_stats_inc(s_cong_send_blocked);
	rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));

	return wait_event_interruptible(map->m_waitq,
					!rds_cong_test_bit(map, port));
}

void rds_cong_exit(void)
{
	struct rb_node *node;
	struct rds_cong_map *map;
	unsigned long i;

	while ((node = rb_first(&rds_cong_tree))) {
		map = rb_entry(node, struct rds_cong_map, m_rb_node);
		rdsdebug("freeing map %p\n", map);
		rb_erase(&map->m_rb_node, &rds_cong_tree);
		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
			free_page(map->m_page_addrs[i]);
		kfree(map);
	}
}

/*
 * Allocate an RDS message containing a congestion update.
 */
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
{
	struct rds_cong_map *map = conn->c_lcong;
	struct rds_message *rm;

	rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
	if (!IS_ERR(rm))
		rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;

	return rm;
}