| 1 | From f8cf19c19528a468cc0b9846c0328a94cccdc605 Mon Sep 17 00:00:00 2001 |
| 2 | From: Eric Dumazet <edumazet@google.com> |
| 3 | Date: Fri, 11 May 2012 09:30:50 +0000 |
| 4 | Subject: [PATCH] fq_codel: Fair Queue Codel AQM |
| 5 | |
| 6 | commit 4b549a2ef4bef9965d97cbd992ba67930cd3e0fe upstream. |
| 7 | |
| 8 | Fair Queue Codel packet scheduler |
| 9 | |
| 10 | Principles : |
| 11 | |
| 12 | - Packets are classified (internal classifier or external) on flows. |
| 13 | - This is a Stochastic model (as we use a hash, several flows might |
| 14 | be hashed on same slot) |
| 15 | - Each flow has a CoDel managed queue. |
| 16 | - Flows are linked onto two (Round Robin) lists, |
| 17 | so that new flows have priority on old ones. |
| 18 | |
| 19 | - For a given flow, packets are not reordered (CoDel uses a FIFO) |
| 20 | - head drops only. |
| 21 | - ECN capability is on by default. |
| 22 | - Very low memory footprint (64 bytes per flow) |
| 23 | |
| 24 | tc qdisc ... fq_codel [ limit PACKETS ] [ flows number ] |
| 25 | [ target TIME ] [ interval TIME ] [ noecn ] |
| 26 | [ quantum BYTES ] |
| 27 | |
| 28 | defaults : 1024 flows, 10240 packets limit, quantum : device MTU |
| 29 | target : 5ms (CoDel default) |
| 30 | interval : 100ms (CoDel default) |
| 31 | |
| 32 | Impressive results on load : |
| 33 | |
| 34 | class htb 1:1 root leaf 10: prio 0 quantum 1514 rate 200000Kbit ceil 200000Kbit burst 1475b/8 mpu 0b overhead 0b cburst 1475b/8 mpu 0b overhead 0b level 0 |
| 35 | Sent 43304920109 bytes 33063109 pkt (dropped 0, overlimits 0 requeues 0) |
| 36 | rate 201691Kbit 28595pps backlog 0b 312p requeues 0 |
| 37 | lended: 33063109 borrowed: 0 giants: 0 |
| 38 | tokens: -912 ctokens: -912 |
| 39 | |
| 40 | class fq_codel 10:1735 parent 10: |
| 41 | (dropped 1292, overlimits 0 requeues 0) |
| 42 | backlog 15140b 10p requeues 0 |
| 43 | deficit 1514 count 1 lastcount 1 ldelay 7.1ms |
| 44 | class fq_codel 10:4524 parent 10: |
| 45 | (dropped 1291, overlimits 0 requeues 0) |
| 46 | backlog 16654b 11p requeues 0 |
| 47 | deficit 1514 count 1 lastcount 1 ldelay 7.1ms |
| 48 | class fq_codel 10:4e74 parent 10: |
| 49 | (dropped 1290, overlimits 0 requeues 0) |
| 50 | backlog 6056b 4p requeues 0 |
| 51 | deficit 1514 count 1 lastcount 1 ldelay 6.4ms dropping drop_next 92.0ms |
| 52 | class fq_codel 10:628a parent 10: |
| 53 | (dropped 1289, overlimits 0 requeues 0) |
| 54 | backlog 7570b 5p requeues 0 |
| 55 | deficit 1514 count 1 lastcount 1 ldelay 5.4ms dropping drop_next 90.9ms |
| 56 | class fq_codel 10:a4b3 parent 10: |
| 57 | (dropped 302, overlimits 0 requeues 0) |
| 58 | backlog 16654b 11p requeues 0 |
| 59 | deficit 1514 count 1 lastcount 1 ldelay 7.1ms |
| 60 | class fq_codel 10:c3c2 parent 10: |
| 61 | (dropped 1284, overlimits 0 requeues 0) |
| 62 | backlog 13626b 9p requeues 0 |
| 63 | deficit 1514 count 1 lastcount 1 ldelay 5.9ms |
| 64 | class fq_codel 10:d331 parent 10: |
| 65 | (dropped 299, overlimits 0 requeues 0) |
| 66 | backlog 15140b 10p requeues 0 |
| 67 | deficit 1514 count 1 lastcount 1 ldelay 7.0ms |
| 68 | class fq_codel 10:d526 parent 10: |
| 69 | (dropped 12160, overlimits 0 requeues 0) |
| 70 | backlog 35870b 211p requeues 0 |
| 71 | deficit 1508 count 12160 lastcount 1 ldelay 15.3ms dropping drop_next 247us |
| 72 | class fq_codel 10:e2c6 parent 10: |
| 73 | (dropped 1288, overlimits 0 requeues 0) |
| 74 | backlog 15140b 10p requeues 0 |
| 75 | deficit 1514 count 1 lastcount 1 ldelay 7.1ms |
| 76 | class fq_codel 10:eab5 parent 10: |
| 77 | (dropped 1285, overlimits 0 requeues 0) |
| 78 | backlog 16654b 11p requeues 0 |
| 79 | deficit 1514 count 1 lastcount 1 ldelay 5.9ms |
| 80 | class fq_codel 10:f220 parent 10: |
| 81 | (dropped 1289, overlimits 0 requeues 0) |
| 82 | backlog 15140b 10p requeues 0 |
| 83 | deficit 1514 count 1 lastcount 1 ldelay 7.1ms |
| 84 | |
| 85 | qdisc htb 1: root refcnt 6 r2q 10 default 1 direct_packets_stat 0 ver 3.17 |
| 86 | Sent 43331086547 bytes 33092812 pkt (dropped 0, overlimits 66063544 requeues 71) |
| 87 | rate 201697Kbit 28602pps backlog 0b 260p requeues 71 |
| 88 | qdisc fq_codel 10: parent 1:1 limit 10240p flows 65536 target 5.0ms interval 100.0ms ecn |
| 89 | Sent 43331086547 bytes 33092812 pkt (dropped 949359, overlimits 0 requeues 0) |
| 90 | rate 201697Kbit 28602pps backlog 189352b 260p requeues 0 |
| 91 | maxpacket 1514 drop_overlimit 0 new_flow_count 5582 ecn_mark 125593 |
| 92 | new_flows_len 0 old_flows_len 11 |
| 93 | |
| 94 | PING 172.30.42.18 (172.30.42.18) 56(84) bytes of data. |
| 95 | 64 bytes from 172.30.42.18: icmp_req=1 ttl=64 time=0.227 ms |
| 96 | 64 bytes from 172.30.42.18: icmp_req=2 ttl=64 time=0.165 ms |
| 97 | 64 bytes from 172.30.42.18: icmp_req=3 ttl=64 time=0.166 ms |
| 98 | 64 bytes from 172.30.42.18: icmp_req=4 ttl=64 time=0.151 ms |
| 99 | 64 bytes from 172.30.42.18: icmp_req=5 ttl=64 time=0.164 ms |
| 100 | 64 bytes from 172.30.42.18: icmp_req=6 ttl=64 time=0.172 ms |
| 101 | 64 bytes from 172.30.42.18: icmp_req=7 ttl=64 time=0.175 ms |
| 102 | 64 bytes from 172.30.42.18: icmp_req=8 ttl=64 time=0.183 ms |
| 103 | 64 bytes from 172.30.42.18: icmp_req=9 ttl=64 time=0.158 ms |
| 104 | 64 bytes from 172.30.42.18: icmp_req=10 ttl=64 time=0.200 ms |
| 105 | |
| 106 | 10 packets transmitted, 10 received, 0% packet loss, time 8999ms |
| 107 | rtt min/avg/max/mdev = 0.151/0.176/0.227/0.022 ms |
| 108 | |
| 109 | Much better than SFQ because of priority given to new flows, and fast |
| 110 | path dirtying less cache lines. |
| 111 | |
| 112 | Signed-off-by: Eric Dumazet <edumazet@google.com> |
| 113 | Signed-off-by: David S. Miller <davem@davemloft.net> |
| 114 | --- |
| 115 | include/linux/pkt_sched.h | 54 ++++ |
| 116 | net/sched/Kconfig | 11 + |
| 117 | net/sched/Makefile | 1 + |
| 118 | net/sched/sch_fq_codel.c | 624 +++++++++++++++++++++++++++++++++++++++++++++ |
| 119 | 4 files changed, 690 insertions(+) |
| 120 | create mode 100644 net/sched/sch_fq_codel.c |
| 121 | |
| 122 | --- a/include/linux/pkt_sched.h |
| 123 | +++ b/include/linux/pkt_sched.h |
| 124 | @@ -659,4 +659,58 @@ struct tc_codel_xstats { |
| 125 | __u32 dropping; /* are we in dropping state ? */ |
| 126 | }; |
| 127 | |
| 128 | +/* FQ_CODEL */ |
| 129 | + |
| 130 | +enum { |
| 131 | + TCA_FQ_CODEL_UNSPEC, |
| 132 | + TCA_FQ_CODEL_TARGET, |
| 133 | + TCA_FQ_CODEL_LIMIT, |
| 134 | + TCA_FQ_CODEL_INTERVAL, |
| 135 | + TCA_FQ_CODEL_ECN, |
| 136 | + TCA_FQ_CODEL_FLOWS, |
| 137 | + TCA_FQ_CODEL_QUANTUM, |
| 138 | + __TCA_FQ_CODEL_MAX |
| 139 | +}; |
| 140 | + |
| 141 | +#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) |
| 142 | + |
| 143 | +enum { |
| 144 | + TCA_FQ_CODEL_XSTATS_QDISC, |
| 145 | + TCA_FQ_CODEL_XSTATS_CLASS, |
| 146 | +}; |
| 147 | + |
| 148 | +struct tc_fq_codel_qd_stats { |
| 149 | + __u32 maxpacket; /* largest packet we've seen so far */ |
| 150 | + __u32 drop_overlimit; /* number of times max qdisc |
| 151 | + * packet limit was hit |
| 152 | + */ |
| 153 | + __u32 ecn_mark; /* number of packets we ECN marked |
| 154 | + * instead of being dropped |
| 155 | + */ |
| 156 | + __u32 new_flow_count; /* number of times packets |
| 157 | + * created a 'new flow' |
| 158 | + */ |
| 159 | + __u32 new_flows_len; /* count of flows in new list */ |
| 160 | + __u32 old_flows_len; /* count of flows in old list */ |
| 161 | +}; |
| 162 | + |
| 163 | +struct tc_fq_codel_cl_stats { |
| 164 | + __s32 deficit; |
| 165 | + __u32 ldelay; /* in-queue delay seen by most recently |
| 166 | + * dequeued packet |
| 167 | + */ |
| 168 | + __u32 count; |
| 169 | + __u32 lastcount; |
| 170 | + __u32 dropping; |
| 171 | + __s32 drop_next; |
| 172 | +}; |
| 173 | + |
| 174 | +struct tc_fq_codel_xstats { |
| 175 | + __u32 type; |
| 176 | + union { |
| 177 | + struct tc_fq_codel_qd_stats qdisc_stats; |
| 178 | + struct tc_fq_codel_cl_stats class_stats; |
| 179 | + }; |
| 180 | +}; |
| 181 | + |
| 182 | #endif |
| 183 | --- a/net/sched/Kconfig |
| 184 | +++ b/net/sched/Kconfig |
| 185 | @@ -261,6 +261,17 @@ config NET_SCH_CODEL |
| 186 | |
| 187 | If unsure, say N. |
| 188 | |
| 189 | +config NET_SCH_FQ_CODEL |
| 190 | + tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)" |
| 191 | + help |
| 192 | + Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL) |
| 193 | + packet scheduling algorithm. |
| 194 | + |
| 195 | + To compile this driver as a module, choose M here: the module |
| 196 | + will be called sch_fq_codel. |
| 197 | + |
| 198 | + If unsure, say N. |
| 199 | + |
| 200 | config NET_SCH_INGRESS |
| 201 | tristate "Ingress Qdisc" |
| 202 | depends on NET_CLS_ACT |
| 203 | --- a/net/sched/Makefile |
| 204 | +++ b/net/sched/Makefile |
| 205 | @@ -37,6 +37,7 @@ obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqpr |
| 206 | obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o |
| 207 | obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o |
| 208 | obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o |
| 209 | +obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o |
| 210 | |
| 211 | obj-$(CONFIG_NET_CLS_U32) += cls_u32.o |
| 212 | obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o |
| 213 | --- /dev/null |
| 214 | +++ b/net/sched/sch_fq_codel.c |
| 215 | @@ -0,0 +1,624 @@ |
| 216 | +/* |
| 217 | + * Fair Queue CoDel discipline |
| 218 | + * |
| 219 | + * This program is free software; you can redistribute it and/or |
| 220 | + * modify it under the terms of the GNU General Public License |
| 221 | + * as published by the Free Software Foundation; either version |
| 222 | + * 2 of the License, or (at your option) any later version. |
| 223 | + * |
| 224 | + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> |
| 225 | + */ |
| 226 | + |
| 227 | +#include <linux/module.h> |
| 228 | +#include <linux/types.h> |
| 229 | +#include <linux/kernel.h> |
| 230 | +#include <linux/jiffies.h> |
| 231 | +#include <linux/string.h> |
| 232 | +#include <linux/in.h> |
| 233 | +#include <linux/errno.h> |
| 234 | +#include <linux/init.h> |
| 235 | +#include <linux/skbuff.h> |
| 236 | +#include <linux/jhash.h> |
| 237 | +#include <linux/slab.h> |
| 238 | +#include <linux/vmalloc.h> |
| 239 | +#include <net/netlink.h> |
| 240 | +#include <net/pkt_sched.h> |
| 241 | +#include <net/flow_keys.h> |
| 242 | +#include <net/codel.h> |
| 243 | + |
| 244 | +/* Fair Queue CoDel. |
| 245 | + * |
| 246 | + * Principles : |
| 247 | + * Packets are classified (internal classifier or external) on flows. |
| 248 | + * This is a Stochastic model (as we use a hash, several flows |
| 249 | + * might be hashed on same slot) |
| 250 | + * Each flow has a CoDel managed queue. |
| 251 | + * Flows are linked onto two (Round Robin) lists, |
| 252 | + * so that new flows have priority on old ones. |
| 253 | + * |
| 254 | + * For a given flow, packets are not reordered (CoDel uses a FIFO) |
| 255 | + * head drops only. |
| 256 | + * ECN capability is on by default. |
| 257 | + * Low memory footprint (64 bytes per flow) |
| 258 | + */ |
| 259 | + |
| 260 | +struct fq_codel_flow { |
| 261 | + struct sk_buff *head; |
| 262 | + struct sk_buff *tail; |
| 263 | + struct list_head flowchain; |
| 264 | + int deficit; |
| 265 | + u32 dropped; /* number of drops (or ECN marks) on this flow */ |
| 266 | + struct codel_vars cvars; |
| 267 | +}; /* please try to keep this structure <= 64 bytes */ |
| 268 | + |
| 269 | +struct fq_codel_sched_data { |
| 270 | + struct tcf_proto *filter_list; /* optional external classifier */ |
| 271 | + struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ |
| 272 | + u32 *backlogs; /* backlog table [flows_cnt] */ |
| 273 | + u32 flows_cnt; /* number of flows */ |
| 274 | + u32 perturbation; /* hash perturbation */ |
| 275 | + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ |
| 276 | + struct codel_params cparams; |
| 277 | + struct codel_stats cstats; |
| 278 | + u32 drop_overlimit; |
| 279 | + u32 new_flow_count; |
| 280 | + |
| 281 | + struct list_head new_flows; /* list of new flows */ |
| 282 | + struct list_head old_flows; /* list of old flows */ |
| 283 | +}; |
| 284 | + |
| 285 | +static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, |
| 286 | + const struct sk_buff *skb) |
| 287 | +{ |
| 288 | + struct flow_keys keys; |
| 289 | + unsigned int hash; |
| 290 | + |
| 291 | + skb_flow_dissect(skb, &keys); |
| 292 | + hash = jhash_3words((__force u32)keys.dst, |
| 293 | + (__force u32)keys.src ^ keys.ip_proto, |
| 294 | + (__force u32)keys.ports, q->perturbation); |
| 295 | + return ((u64)hash * q->flows_cnt) >> 32; |
| 296 | +} |
| 297 | + |
| 298 | +static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, |
| 299 | + int *qerr) |
| 300 | +{ |
| 301 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 302 | + struct tcf_result res; |
| 303 | + int result; |
| 304 | + |
| 305 | + if (TC_H_MAJ(skb->priority) == sch->handle && |
| 306 | + TC_H_MIN(skb->priority) > 0 && |
| 307 | + TC_H_MIN(skb->priority) <= q->flows_cnt) |
| 308 | + return TC_H_MIN(skb->priority); |
| 309 | + |
| 310 | + if (!q->filter_list) |
| 311 | + return fq_codel_hash(q, skb) + 1; |
| 312 | + |
| 313 | + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; |
| 314 | + result = tc_classify(skb, q->filter_list, &res); |
| 315 | + if (result >= 0) { |
| 316 | +#ifdef CONFIG_NET_CLS_ACT |
| 317 | + switch (result) { |
| 318 | + case TC_ACT_STOLEN: |
| 319 | + case TC_ACT_QUEUED: |
| 320 | + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; |
| 321 | + case TC_ACT_SHOT: |
| 322 | + return 0; |
| 323 | + } |
| 324 | +#endif |
| 325 | + if (TC_H_MIN(res.classid) <= q->flows_cnt) |
| 326 | + return TC_H_MIN(res.classid); |
| 327 | + } |
| 328 | + return 0; |
| 329 | +} |
| 330 | + |
| 331 | +/* helper functions : might be changed when/if skb uses a standard list_head */ |
| 332 | + |
| 333 | +/* remove one skb from head of slot queue */ |
| 334 | +static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) |
| 335 | +{ |
| 336 | + struct sk_buff *skb = flow->head; |
| 337 | + |
| 338 | + flow->head = skb->next; |
| 339 | + skb->next = NULL; |
| 340 | + return skb; |
| 341 | +} |
| 342 | + |
| 343 | +/* add skb to flow queue (tail add) */ |
| 344 | +static inline void flow_queue_add(struct fq_codel_flow *flow, |
| 345 | + struct sk_buff *skb) |
| 346 | +{ |
| 347 | + if (flow->head == NULL) |
| 348 | + flow->head = skb; |
| 349 | + else |
| 350 | + flow->tail->next = skb; |
| 351 | + flow->tail = skb; |
| 352 | + skb->next = NULL; |
| 353 | +} |
| 354 | + |
| 355 | +static unsigned int fq_codel_drop(struct Qdisc *sch) |
| 356 | +{ |
| 357 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 358 | + struct sk_buff *skb; |
| 359 | + unsigned int maxbacklog = 0, idx = 0, i, len; |
| 360 | + struct fq_codel_flow *flow; |
| 361 | + |
| 362 | + /* Queue is full! Find the fat flow and drop packet from it. |
| 363 | + * This might sound expensive, but with 1024 flows, we scan |
| 364 | + * 4KB of memory, and we don't need to handle a complex tree |
| 365 | + * in fast path (packet enqueue/dequeue) with many cache misses. |
| 366 | + */ |
| 367 | + for (i = 0; i < q->flows_cnt; i++) { |
| 368 | + if (q->backlogs[i] > maxbacklog) { |
| 369 | + maxbacklog = q->backlogs[i]; |
| 370 | + idx = i; |
| 371 | + } |
| 372 | + } |
| 373 | + flow = &q->flows[idx]; |
| 374 | + skb = dequeue_head(flow); |
| 375 | + len = qdisc_pkt_len(skb); |
| 376 | + q->backlogs[idx] -= len; |
| 377 | + kfree_skb(skb); |
| 378 | + sch->q.qlen--; |
| 379 | + sch->qstats.drops++; |
| 380 | + sch->qstats.backlog -= len; |
| 381 | + flow->dropped++; |
| 382 | + return idx; |
| 383 | +} |
| 384 | + |
| 385 | +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) |
| 386 | +{ |
| 387 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 388 | + unsigned int idx; |
| 389 | + struct fq_codel_flow *flow; |
| 390 | + int uninitialized_var(ret); |
| 391 | + |
| 392 | + idx = fq_codel_classify(skb, sch, &ret); |
| 393 | + if (idx == 0) { |
| 394 | + if (ret & __NET_XMIT_BYPASS) |
| 395 | + sch->qstats.drops++; |
| 396 | + kfree_skb(skb); |
| 397 | + return ret; |
| 398 | + } |
| 399 | + idx--; |
| 400 | + |
| 401 | + codel_set_enqueue_time(skb); |
| 402 | + flow = &q->flows[idx]; |
| 403 | + flow_queue_add(flow, skb); |
| 404 | + q->backlogs[idx] += qdisc_pkt_len(skb); |
| 405 | + sch->qstats.backlog += qdisc_pkt_len(skb); |
| 406 | + |
| 407 | + if (list_empty(&flow->flowchain)) { |
| 408 | + list_add_tail(&flow->flowchain, &q->new_flows); |
| 409 | + codel_vars_init(&flow->cvars); |
| 410 | + q->new_flow_count++; |
| 411 | + flow->deficit = q->quantum; |
| 412 | + flow->dropped = 0; |
| 413 | + } |
| 414 | + if (++sch->q.qlen < sch->limit) |
| 415 | + return NET_XMIT_SUCCESS; |
| 416 | + |
| 417 | + q->drop_overlimit++; |
| 418 | + /* Return Congestion Notification only if we dropped a packet |
| 419 | + * from this flow. |
| 420 | + */ |
| 421 | + if (fq_codel_drop(sch) == idx) |
| 422 | + return NET_XMIT_CN; |
| 423 | + |
| 424 | + /* As we dropped a packet, better let upper stack know this */ |
| 425 | + qdisc_tree_decrease_qlen(sch, 1); |
| 426 | + return NET_XMIT_SUCCESS; |
| 427 | +} |
| 428 | + |
| 429 | +/* This is the specific function called from codel_dequeue() |
| 430 | + * to dequeue a packet from queue. Note: the per-flow backlog is handled |
| 431 | + * in codel; only the qdisc-wide backlog and qlen are reduced here. |
| 432 | + */ |
| 433 | +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) |
| 434 | +{ |
| 435 | + struct fq_codel_flow *flow; |
| 436 | + struct sk_buff *skb = NULL; |
| 437 | + |
| 438 | + flow = container_of(vars, struct fq_codel_flow, cvars); |
| 439 | + if (flow->head) { |
| 440 | + skb = dequeue_head(flow); |
| 441 | + sch->qstats.backlog -= qdisc_pkt_len(skb); |
| 442 | + sch->q.qlen--; |
| 443 | + } |
| 444 | + return skb; |
| 445 | +} |
| 446 | + |
| 447 | +static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) |
| 448 | +{ |
| 449 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 450 | + struct sk_buff *skb; |
| 451 | + struct fq_codel_flow *flow; |
| 452 | + struct list_head *head; |
| 453 | + u32 prev_drop_count, prev_ecn_mark; |
| 454 | + |
| 455 | +begin: |
| 456 | + head = &q->new_flows; |
| 457 | + if (list_empty(head)) { |
| 458 | + head = &q->old_flows; |
| 459 | + if (list_empty(head)) |
| 460 | + return NULL; |
| 461 | + } |
| 462 | + flow = list_first_entry(head, struct fq_codel_flow, flowchain); |
| 463 | + |
| 464 | + if (flow->deficit <= 0) { |
| 465 | + flow->deficit += q->quantum; |
| 466 | + list_move_tail(&flow->flowchain, &q->old_flows); |
| 467 | + goto begin; |
| 468 | + } |
| 469 | + |
| 470 | + prev_drop_count = q->cstats.drop_count; |
| 471 | + prev_ecn_mark = q->cstats.ecn_mark; |
| 472 | + |
| 473 | + skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, |
| 474 | + dequeue, &q->backlogs[flow - q->flows]); |
| 475 | + |
| 476 | + flow->dropped += q->cstats.drop_count - prev_drop_count; |
| 477 | + flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; |
| 478 | + |
| 479 | + if (!skb) { |
| 480 | + /* force a pass through old_flows to prevent starvation */ |
| 481 | + if ((head == &q->new_flows) && !list_empty(&q->old_flows)) |
| 482 | + list_move_tail(&flow->flowchain, &q->old_flows); |
| 483 | + else |
| 484 | + list_del_init(&flow->flowchain); |
| 485 | + goto begin; |
| 486 | + } |
| 487 | + qdisc_bstats_update(sch, skb); |
| 488 | + flow->deficit -= qdisc_pkt_len(skb); |
| 489 | + /* We can't call qdisc_tree_decrease_qlen() if our qlen is 0, |
| 490 | + * or HTB crashes. Defer it for next round. |
| 491 | + */ |
| 492 | + if (q->cstats.drop_count && sch->q.qlen) { |
| 493 | + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); |
| 494 | + q->cstats.drop_count = 0; |
| 495 | + } |
| 496 | + return skb; |
| 497 | +} |
| 498 | + |
| 499 | +static void fq_codel_reset(struct Qdisc *sch) |
| 500 | +{ |
| 501 | + struct sk_buff *skb; |
| 502 | + |
| 503 | + while ((skb = fq_codel_dequeue(sch)) != NULL) |
| 504 | + kfree_skb(skb); |
| 505 | +} |
| 506 | + |
| 507 | +static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { |
| 508 | + [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 }, |
| 509 | + [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 }, |
| 510 | + [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 }, |
| 511 | + [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, |
| 512 | + [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, |
| 513 | + [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, |
| 514 | +}; |
| 515 | + |
| 516 | +static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) |
| 517 | +{ |
| 518 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 519 | + struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; |
| 520 | + int err; |
| 521 | + |
| 522 | + if (!opt) |
| 523 | + return -EINVAL; |
| 524 | + |
| 525 | + err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy); |
| 526 | + if (err < 0) |
| 527 | + return err; |
| 528 | + if (tb[TCA_FQ_CODEL_FLOWS]) { |
| 529 | + if (q->flows) |
| 530 | + return -EINVAL; |
| 531 | + q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); |
| 532 | + if (!q->flows_cnt || |
| 533 | + q->flows_cnt > 65536) |
| 534 | + return -EINVAL; |
| 535 | + } |
| 536 | + sch_tree_lock(sch); |
| 537 | + |
| 538 | + if (tb[TCA_FQ_CODEL_TARGET]) { |
| 539 | + u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); |
| 540 | + |
| 541 | + q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; |
| 542 | + } |
| 543 | + |
| 544 | + if (tb[TCA_FQ_CODEL_INTERVAL]) { |
| 545 | + u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); |
| 546 | + |
| 547 | + q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; |
| 548 | + } |
| 549 | + |
| 550 | + if (tb[TCA_FQ_CODEL_LIMIT]) |
| 551 | + sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); |
| 552 | + |
| 553 | + if (tb[TCA_FQ_CODEL_ECN]) |
| 554 | + q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); |
| 555 | + |
| 556 | + if (tb[TCA_FQ_CODEL_QUANTUM]) |
| 557 | + q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); |
| 558 | + |
| 559 | + while (sch->q.qlen > sch->limit) { |
| 560 | + struct sk_buff *skb = fq_codel_dequeue(sch); |
| 561 | + |
| 562 | + kfree_skb(skb); |
| 563 | + q->cstats.drop_count++; |
| 564 | + } |
| 565 | + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); |
| 566 | + q->cstats.drop_count = 0; |
| 567 | + |
| 568 | + sch_tree_unlock(sch); |
| 569 | + return 0; |
| 570 | +} |
| 571 | + |
| 572 | +static void *fq_codel_zalloc(size_t sz) |
| 573 | +{ |
| 574 | + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); |
| 575 | + |
| 576 | + if (!ptr) |
| 577 | + ptr = vzalloc(sz); |
| 578 | + return ptr; |
| 579 | +} |
| 580 | + |
| 581 | +static void fq_codel_free(void *addr) |
| 582 | +{ |
| 583 | + if (addr) { |
| 584 | + if (is_vmalloc_addr(addr)) |
| 585 | + vfree(addr); |
| 586 | + else |
| 587 | + kfree(addr); |
| 588 | + } |
| 589 | +} |
| 590 | + |
| 591 | +static void fq_codel_destroy(struct Qdisc *sch) |
| 592 | +{ |
| 593 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 594 | + |
| 595 | + tcf_destroy_chain(&q->filter_list); |
| 596 | + fq_codel_free(q->backlogs); |
| 597 | + fq_codel_free(q->flows); |
| 598 | +} |
| 599 | + |
| 600 | +static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) |
| 601 | +{ |
| 602 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 603 | + int i; |
| 604 | + |
| 605 | + sch->limit = 10*1024; |
| 606 | + q->flows_cnt = 1024; |
| 607 | + q->quantum = psched_mtu(qdisc_dev(sch)); |
| 608 | + q->perturbation = net_random(); |
| 609 | + INIT_LIST_HEAD(&q->new_flows); |
| 610 | + INIT_LIST_HEAD(&q->old_flows); |
| 611 | + codel_params_init(&q->cparams); |
| 612 | + codel_stats_init(&q->cstats); |
| 613 | + q->cparams.ecn = true; |
| 614 | + |
| 615 | + if (opt) { |
| 616 | + int err = fq_codel_change(sch, opt); |
| 617 | + if (err) |
| 618 | + return err; |
| 619 | + } |
| 620 | + |
| 621 | + if (!q->flows) { |
| 622 | + q->flows = fq_codel_zalloc(q->flows_cnt * |
| 623 | + sizeof(struct fq_codel_flow)); |
| 624 | + if (!q->flows) |
| 625 | + return -ENOMEM; |
| 626 | + q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); |
| 627 | + if (!q->backlogs) { |
| 628 | + fq_codel_free(q->flows); |
| 629 | + return -ENOMEM; |
| 630 | + } |
| 631 | + for (i = 0; i < q->flows_cnt; i++) { |
| 632 | + struct fq_codel_flow *flow = q->flows + i; |
| 633 | + |
| 634 | + INIT_LIST_HEAD(&flow->flowchain); |
| 635 | + } |
| 636 | + } |
| 637 | + if (sch->limit >= 1) |
| 638 | + sch->flags |= TCQ_F_CAN_BYPASS; |
| 639 | + else |
| 640 | + sch->flags &= ~TCQ_F_CAN_BYPASS; |
| 641 | + return 0; |
| 642 | +} |
| 643 | + |
| 644 | +static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) |
| 645 | +{ |
| 646 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 647 | + struct nlattr *opts; |
| 648 | + |
| 649 | + opts = nla_nest_start(skb, TCA_OPTIONS); |
| 650 | + if (opts == NULL) |
| 651 | + goto nla_put_failure; |
| 652 | + |
| 653 | + if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, |
| 654 | + codel_time_to_us(q->cparams.target)) || |
| 655 | + nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, |
| 656 | + sch->limit) || |
| 657 | + nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, |
| 658 | + codel_time_to_us(q->cparams.interval)) || |
| 659 | + nla_put_u32(skb, TCA_FQ_CODEL_ECN, |
| 660 | + q->cparams.ecn) || |
| 661 | + nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, |
| 662 | + q->quantum) || |
| 663 | + nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, |
| 664 | + q->flows_cnt)) |
| 665 | + goto nla_put_failure; |
| 666 | + |
| 667 | + nla_nest_end(skb, opts); |
| 668 | + return skb->len; |
| 669 | + |
| 670 | +nla_put_failure: |
| 671 | + return -1; |
| 672 | +} |
| 673 | + |
| 674 | +static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) |
| 675 | +{ |
| 676 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 677 | + struct tc_fq_codel_xstats st = { |
| 678 | + .type = TCA_FQ_CODEL_XSTATS_QDISC, |
| 679 | + .qdisc_stats.maxpacket = q->cstats.maxpacket, |
| 680 | + .qdisc_stats.drop_overlimit = q->drop_overlimit, |
| 681 | + .qdisc_stats.ecn_mark = q->cstats.ecn_mark, |
| 682 | + .qdisc_stats.new_flow_count = q->new_flow_count, |
| 683 | + }; |
| 684 | + struct list_head *pos; |
| 685 | + |
| 686 | + list_for_each(pos, &q->new_flows) |
| 687 | + st.qdisc_stats.new_flows_len++; |
| 688 | + |
| 689 | + list_for_each(pos, &q->old_flows) |
| 690 | + st.qdisc_stats.old_flows_len++; |
| 691 | + |
| 692 | + return gnet_stats_copy_app(d, &st, sizeof(st)); |
| 693 | +} |
| 694 | + |
| 695 | +static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) |
| 696 | +{ |
| 697 | + return NULL; |
| 698 | +} |
| 699 | + |
| 700 | +static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) |
| 701 | +{ |
| 702 | + return 0; |
| 703 | +} |
| 704 | + |
| 705 | +static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, |
| 706 | + u32 classid) |
| 707 | +{ |
| 708 | + /* we cannot bypass queue discipline anymore */ |
| 709 | + sch->flags &= ~TCQ_F_CAN_BYPASS; |
| 710 | + return 0; |
| 711 | +} |
| 712 | + |
| 713 | +static void fq_codel_put(struct Qdisc *q, unsigned long cl) |
| 714 | +{ |
| 715 | +} |
| 716 | + |
| 717 | +static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) |
| 718 | +{ |
| 719 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 720 | + |
| 721 | + if (cl) |
| 722 | + return NULL; |
| 723 | + return &q->filter_list; |
| 724 | +} |
| 725 | + |
| 726 | +static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, |
| 727 | + struct sk_buff *skb, struct tcmsg *tcm) |
| 728 | +{ |
| 729 | + tcm->tcm_handle |= TC_H_MIN(cl); |
| 730 | + return 0; |
| 731 | +} |
| 732 | + |
| 733 | +static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, |
| 734 | + struct gnet_dump *d) |
| 735 | +{ |
| 736 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 737 | + u32 idx = cl - 1; |
| 738 | + struct gnet_stats_queue qs = { 0 }; |
| 739 | + struct tc_fq_codel_xstats xstats; |
| 740 | + |
| 741 | + if (idx < q->flows_cnt) { |
| 742 | + const struct fq_codel_flow *flow = &q->flows[idx]; |
| 743 | + const struct sk_buff *skb = flow->head; |
| 744 | + |
| 745 | + memset(&xstats, 0, sizeof(xstats)); |
| 746 | + xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; |
| 747 | + xstats.class_stats.deficit = flow->deficit; |
| 748 | + xstats.class_stats.ldelay = |
| 749 | + codel_time_to_us(flow->cvars.ldelay); |
| 750 | + xstats.class_stats.count = flow->cvars.count; |
| 751 | + xstats.class_stats.lastcount = flow->cvars.lastcount; |
| 752 | + xstats.class_stats.dropping = flow->cvars.dropping; |
| 753 | + if (flow->cvars.dropping) { |
| 754 | + codel_tdiff_t delta = flow->cvars.drop_next - |
| 755 | + codel_get_time(); |
| 756 | + |
| 757 | + xstats.class_stats.drop_next = (delta >= 0) ? |
| 758 | + codel_time_to_us(delta) : |
| 759 | + -codel_time_to_us(-delta); |
| 760 | + } |
| 761 | + while (skb) { |
| 762 | + qs.qlen++; |
| 763 | + skb = skb->next; |
| 764 | + } |
| 765 | + qs.backlog = q->backlogs[idx]; |
| 766 | + qs.drops = flow->dropped; |
| 767 | + } |
| 768 | + if (gnet_stats_copy_queue(d, &qs) < 0) |
| 769 | + return -1; |
| 770 | + if (idx < q->flows_cnt) |
| 771 | + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); |
| 772 | + return 0; |
| 773 | +} |
| 774 | + |
| 775 | +static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
| 776 | +{ |
| 777 | + struct fq_codel_sched_data *q = qdisc_priv(sch); |
| 778 | + unsigned int i; |
| 779 | + |
| 780 | + if (arg->stop) |
| 781 | + return; |
| 782 | + |
| 783 | + for (i = 0; i < q->flows_cnt; i++) { |
| 784 | + if (list_empty(&q->flows[i].flowchain) || |
| 785 | + arg->count < arg->skip) { |
| 786 | + arg->count++; |
| 787 | + continue; |
| 788 | + } |
| 789 | + if (arg->fn(sch, i + 1, arg) < 0) { |
| 790 | + arg->stop = 1; |
| 791 | + break; |
| 792 | + } |
| 793 | + arg->count++; |
| 794 | + } |
| 795 | +} |
| 796 | + |
| 797 | +static const struct Qdisc_class_ops fq_codel_class_ops = { |
| 798 | + .leaf = fq_codel_leaf, |
| 799 | + .get = fq_codel_get, |
| 800 | + .put = fq_codel_put, |
| 801 | + .tcf_chain = fq_codel_find_tcf, |
| 802 | + .bind_tcf = fq_codel_bind, |
| 803 | + .unbind_tcf = fq_codel_put, |
| 804 | + .dump = fq_codel_dump_class, |
| 805 | + .dump_stats = fq_codel_dump_class_stats, |
| 806 | + .walk = fq_codel_walk, |
| 807 | +}; |
| 808 | + |
| 809 | +static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { |
| 810 | + .cl_ops = &fq_codel_class_ops, |
| 811 | + .id = "fq_codel", |
| 812 | + .priv_size = sizeof(struct fq_codel_sched_data), |
| 813 | + .enqueue = fq_codel_enqueue, |
| 814 | + .dequeue = fq_codel_dequeue, |
| 815 | + .peek = qdisc_peek_dequeued, |
| 816 | + .drop = fq_codel_drop, |
| 817 | + .init = fq_codel_init, |
| 818 | + .reset = fq_codel_reset, |
| 819 | + .destroy = fq_codel_destroy, |
| 820 | + .change = fq_codel_change, |
| 821 | + .dump = fq_codel_dump, |
| 822 | + .dump_stats = fq_codel_dump_stats, |
| 823 | + .owner = THIS_MODULE, |
| 824 | +}; |
| 825 | + |
| 826 | +static int __init fq_codel_module_init(void) |
| 827 | +{ |
| 828 | + return register_qdisc(&fq_codel_qdisc_ops); |
| 829 | +} |
| 830 | + |
| 831 | +static void __exit fq_codel_module_exit(void) |
| 832 | +{ |
| 833 | + unregister_qdisc(&fq_codel_qdisc_ops); |
| 834 | +} |
| 835 | + |
| 836 | +module_init(fq_codel_module_init) |
| 837 | +module_exit(fq_codel_module_exit) |
| 838 | +MODULE_AUTHOR("Eric Dumazet"); |
| 839 | +MODULE_LICENSE("GPL"); |
| 840 | |