| 1 | --- a/include/linux/pkt_sched.h |
| 2 | +++ b/include/linux/pkt_sched.h |
| 3 | @@ -173,8 +173,37 @@ struct tc_sfq_xstats { |
| 4 | * |
| 5 | * The only reason for this is efficiency, it is possible |
| 6 | * to change these parameters in compile time. |
| 7 | + * |
| 8 | + * If you need to play with these values, use esfq instead. |
| 9 | */ |
| 10 | |
| 11 | +/* ESFQ section */ |
| 12 | + |
| 13 | +enum |
| 14 | +{ |
| 15 | + /* traditional */ |
| 16 | + TCA_SFQ_HASH_CLASSIC, |
| 17 | + TCA_SFQ_HASH_DST, |
| 18 | + TCA_SFQ_HASH_SRC, |
| 19 | + TCA_SFQ_HASH_FWMARK, |
| 20 | + /* conntrack */ |
| 21 | + TCA_SFQ_HASH_CTORIGDST, |
| 22 | + TCA_SFQ_HASH_CTORIGSRC, |
| 23 | + TCA_SFQ_HASH_CTREPLDST, |
| 24 | + TCA_SFQ_HASH_CTREPLSRC, |
| 25 | + TCA_SFQ_HASH_CTNATCHG, |
| 26 | +}; |
| 27 | + |
| 28 | +struct tc_esfq_qopt |
| 29 | +{ |
| 30 | + unsigned quantum; /* Bytes per round allocated to flow */ |
| 31 | + int perturb_period; /* Period of hash perturbation */ |
| 32 | + __u32 limit; /* Maximal packets in queue */ |
| 33 | + unsigned divisor; /* Hash divisor */ |
| 34 | + unsigned flows; /* Maximal number of flows */ |
| 35 | + unsigned hash_kind; /* Hash function to use for flow identification */ |
| 36 | +}; |
| 37 | + |
| 38 | /* RED section */ |
| 39 | |
| 40 | enum { |
| 41 | --- a/net/sched/Kconfig |
| 42 | +++ b/net/sched/Kconfig |
| 43 | @@ -148,6 +148,37 @@ config NET_SCH_SFQ |
| 44 | To compile this code as a module, choose M here: the |
| 45 | module will be called sch_sfq. |
| 46 | |
| 47 | +config NET_SCH_ESFQ |
| 48 | + tristate "Enhanced Stochastic Fairness Queueing (ESFQ)" |
| 49 | + ---help--- |
| 50 | + Say Y here if you want to use the Enhanced Stochastic Fairness |
| 51 | + Queueing (ESFQ) packet scheduling algorithm for some of your network |
| 52 | + devices or as a leaf discipline for a classful qdisc such as HTB or |
| 53 | + CBQ (see the top of <file:net/sched/sch_esfq.c> for details and |
| 54 | + references to the SFQ algorithm). |
| 55 | + |
| 56 | + This is an enchanced SFQ version which allows you to control some |
| 57 | + hardcoded values in the SFQ scheduler. |
| 58 | + |
| 59 | + ESFQ also adds control of the hash function used to identify packet |
| 60 | + flows. The original SFQ discipline hashes by connection; ESFQ add |
| 61 | + several other hashing methods, such as by src IP or by dst IP, which |
| 62 | + can be more fair to users in some networking situations. |
| 63 | + |
| 64 | + To compile this code as a module, choose M here: the |
| 65 | + module will be called sch_esfq. |
| 66 | + |
| 67 | +config NET_SCH_ESFQ_NFCT |
| 68 | + bool "Connection Tracking Hash Types" |
| 69 | + depends on NET_SCH_ESFQ && NF_CONNTRACK |
| 70 | + ---help--- |
| 71 | + Say Y here to enable support for hashing based on netfilter connection |
| 72 | + tracking information. This is useful for a router that is also using |
| 73 | + NAT to connect privately-addressed hosts to the Internet. If you want |
| 74 | + to provide fair distribution of upstream bandwidth, ESFQ must use |
| 75 | + connection tracking information, since all outgoing packets will share |
| 76 | + the same source address. |
| 77 | + |
| 78 | config NET_SCH_TEQL |
| 79 | tristate "True Link Equalizer (TEQL)" |
| 80 | ---help--- |
| 81 | --- a/net/sched/Makefile |
| 82 | +++ b/net/sched/Makefile |
| 83 | @@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_INGRESS) += sch_ing |
| 84 | obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o |
| 85 | obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o |
| 86 | obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o |
| 87 | +obj-$(CONFIG_NET_SCH_ESFQ) += sch_esfq.o |
| 88 | obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o |
| 89 | obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o |
| 90 | obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o |
| 91 | --- /dev/null |
| 92 | +++ b/net/sched/sch_esfq.c |
| 93 | @@ -0,0 +1,702 @@ |
| 94 | +/* |
| 95 | + * net/sched/sch_esfq.c Extended Stochastic Fairness Queueing discipline. |
| 96 | + * |
| 97 | + * This program is free software; you can redistribute it and/or |
| 98 | + * modify it under the terms of the GNU General Public License |
| 99 | + * as published by the Free Software Foundation; either version |
| 100 | + * 2 of the License, or (at your option) any later version. |
| 101 | + * |
| 102 | + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
| 103 | + * |
| 104 | + * Changes: Alexander Atanasov, <alex@ssi.bg> |
| 105 | + * Added dynamic depth,limit,divisor,hash_kind options. |
| 106 | + * Added dst and src hashes. |
| 107 | + * |
| 108 | + * Alexander Clouter, <alex@digriz.org.uk> |
| 109 | + * Ported ESFQ to Linux 2.6. |
| 110 | + * |
| 111 | + * Corey Hickey, <bugfood-c@fatooh.org> |
| 112 | + * Maintenance of the Linux 2.6 port. |
| 113 | + * Added fwmark hash (thanks to Robert Kurjata). |
| 114 | + * Added usage of jhash. |
| 115 | + * Added conntrack support. |
| 116 | + * Added ctnatchg hash (thanks to Ben Pfountz). |
| 117 | + */ |
| 118 | + |
| 119 | +#include <linux/module.h> |
| 120 | +#include <asm/uaccess.h> |
| 121 | +#include <asm/system.h> |
| 122 | +#include <linux/bitops.h> |
| 123 | +#include <linux/types.h> |
| 124 | +#include <linux/kernel.h> |
| 125 | +#include <linux/jiffies.h> |
| 126 | +#include <linux/string.h> |
| 127 | +#include <linux/mm.h> |
| 128 | +#include <linux/socket.h> |
| 129 | +#include <linux/sockios.h> |
| 130 | +#include <linux/in.h> |
| 131 | +#include <linux/errno.h> |
| 132 | +#include <linux/interrupt.h> |
| 133 | +#include <linux/if_ether.h> |
| 134 | +#include <linux/inet.h> |
| 135 | +#include <linux/netdevice.h> |
| 136 | +#include <linux/etherdevice.h> |
| 137 | +#include <linux/notifier.h> |
| 138 | +#include <linux/init.h> |
| 139 | +#include <net/ip.h> |
| 140 | +#include <net/netlink.h> |
| 141 | +#include <linux/ipv6.h> |
| 142 | +#include <net/route.h> |
| 143 | +#include <linux/skbuff.h> |
| 144 | +#include <net/sock.h> |
| 145 | +#include <net/pkt_sched.h> |
| 146 | +#include <linux/jhash.h> |
| 147 | +#ifdef CONFIG_NET_SCH_ESFQ_NFCT |
| 148 | +#include <net/netfilter/nf_conntrack.h> |
| 149 | +#endif |
| 150 | + |
| 151 | +/* Stochastic Fairness Queuing algorithm. |
| 152 | + For more comments look at sch_sfq.c. |
| 153 | + The difference is that you can change limit, depth, |
| 154 | + hash table size and choose alternate hash types. |
| 155 | + |
| 156 | + classic: same as in sch_sfq.c |
| 157 | + dst: destination IP address |
| 158 | + src: source IP address |
| 159 | + fwmark: netfilter mark value |
| 160 | + ctorigdst: original destination IP address |
| 161 | + ctorigsrc: original source IP address |
| 162 | + ctrepldst: reply destination IP address |
| 163 | + ctreplsrc: reply source IP |
| 164 | + |
| 165 | +*/ |
| 166 | + |
| 167 | +#define ESFQ_HEAD 0 |
| 168 | +#define ESFQ_TAIL 1 |
| 169 | + |
| 170 | +/* This type should contain at least SFQ_DEPTH*2 values */ |
| 171 | +typedef unsigned int esfq_index; |
| 172 | + |
| 173 | +struct esfq_head |
| 174 | +{ |
| 175 | + esfq_index next; |
| 176 | + esfq_index prev; |
| 177 | +}; |
| 178 | + |
| 179 | +struct esfq_sched_data |
| 180 | +{ |
| 181 | +/* Parameters */ |
| 182 | + int perturb_period; |
| 183 | + unsigned quantum; /* Allotment per round: MUST BE >= MTU */ |
| 184 | + int limit; |
| 185 | + unsigned depth; |
| 186 | + unsigned hash_divisor; |
| 187 | + unsigned hash_kind; |
| 188 | +/* Variables */ |
| 189 | + struct timer_list perturb_timer; |
| 190 | + int perturbation; |
| 191 | + esfq_index tail; /* Index of current slot in round */ |
| 192 | + esfq_index max_depth; /* Maximal depth */ |
| 193 | + |
| 194 | + esfq_index *ht; /* Hash table */ |
| 195 | + esfq_index *next; /* Active slots link */ |
| 196 | + short *allot; /* Current allotment per slot */ |
| 197 | + unsigned short *hash; /* Hash value indexed by slots */ |
| 198 | + struct sk_buff_head *qs; /* Slot queue */ |
| 199 | + struct esfq_head *dep; /* Linked list of slots, indexed by depth */ |
| 200 | +}; |
| 201 | + |
| 202 | +/* This contains the info we will hash. */ |
| 203 | +struct esfq_packet_info |
| 204 | +{ |
| 205 | + u32 proto; /* protocol or port */ |
| 206 | + u32 src; /* source from packet header */ |
| 207 | + u32 dst; /* destination from packet header */ |
| 208 | + u32 ctorigsrc; /* original source from conntrack */ |
| 209 | + u32 ctorigdst; /* original destination from conntrack */ |
| 210 | + u32 ctreplsrc; /* reply source from conntrack */ |
| 211 | + u32 ctrepldst; /* reply destination from conntrack */ |
| 212 | + u32 mark; /* netfilter mark (fwmark) */ |
| 213 | +}; |
| 214 | + |
| 215 | +static __inline__ unsigned esfq_jhash_1word(struct esfq_sched_data *q,u32 a) |
| 216 | +{ |
| 217 | + return jhash_1word(a, q->perturbation) & (q->hash_divisor-1); |
| 218 | +} |
| 219 | + |
| 220 | +static __inline__ unsigned esfq_jhash_2words(struct esfq_sched_data *q, u32 a, u32 b) |
| 221 | +{ |
| 222 | + return jhash_2words(a, b, q->perturbation) & (q->hash_divisor-1); |
| 223 | +} |
| 224 | + |
| 225 | +static __inline__ unsigned esfq_jhash_3words(struct esfq_sched_data *q, u32 a, u32 b, u32 c) |
| 226 | +{ |
| 227 | + return jhash_3words(a, b, c, q->perturbation) & (q->hash_divisor-1); |
| 228 | +} |
| 229 | + |
| 230 | +static unsigned esfq_hash(struct esfq_sched_data *q, struct sk_buff *skb) |
| 231 | +{ |
| 232 | + struct esfq_packet_info info; |
| 233 | +#ifdef CONFIG_NET_SCH_ESFQ_NFCT |
| 234 | + enum ip_conntrack_info ctinfo; |
| 235 | + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); |
| 236 | +#endif |
| 237 | + |
| 238 | + switch (skb->protocol) { |
| 239 | + case __constant_htons(ETH_P_IP): |
| 240 | + { |
| 241 | + struct iphdr *iph = ip_hdr(skb); |
| 242 | + info.dst = iph->daddr; |
| 243 | + info.src = iph->saddr; |
| 244 | + if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && |
| 245 | + (iph->protocol == IPPROTO_TCP || |
| 246 | + iph->protocol == IPPROTO_UDP || |
| 247 | + iph->protocol == IPPROTO_SCTP || |
| 248 | + iph->protocol == IPPROTO_DCCP || |
| 249 | + iph->protocol == IPPROTO_ESP)) |
| 250 | + info.proto = *(((u32*)iph) + iph->ihl); |
| 251 | + else |
| 252 | + info.proto = iph->protocol; |
| 253 | + break; |
| 254 | + } |
| 255 | + case __constant_htons(ETH_P_IPV6): |
| 256 | + { |
| 257 | + struct ipv6hdr *iph = ipv6_hdr(skb); |
| 258 | + /* Hash ipv6 addresses into a u32. This isn't ideal, |
| 259 | + * but the code is simple. */ |
| 260 | + info.dst = jhash2(iph->daddr.s6_addr32, 4, q->perturbation); |
| 261 | + info.src = jhash2(iph->saddr.s6_addr32, 4, q->perturbation); |
| 262 | + if (iph->nexthdr == IPPROTO_TCP || |
| 263 | + iph->nexthdr == IPPROTO_UDP || |
| 264 | + iph->nexthdr == IPPROTO_SCTP || |
| 265 | + iph->nexthdr == IPPROTO_DCCP || |
| 266 | + iph->nexthdr == IPPROTO_ESP) |
| 267 | + info.proto = *(u32*)&iph[1]; |
| 268 | + else |
| 269 | + info.proto = iph->nexthdr; |
| 270 | + break; |
| 271 | + } |
| 272 | + default: |
| 273 | + info.dst = (u32)(unsigned long)skb_dst(skb); |
| 274 | + info.src = (u32)(unsigned long)skb->sk; |
| 275 | + info.proto = skb->protocol; |
| 276 | + } |
| 277 | + |
| 278 | + info.mark = skb->mark; |
| 279 | + |
| 280 | +#ifdef CONFIG_NET_SCH_ESFQ_NFCT |
| 281 | + /* defaults if there is no conntrack info */ |
| 282 | + info.ctorigsrc = info.src; |
| 283 | + info.ctorigdst = info.dst; |
| 284 | + info.ctreplsrc = info.dst; |
| 285 | + info.ctrepldst = info.src; |
| 286 | + /* collect conntrack info */ |
| 287 | + if (ct && ct != &nf_conntrack_untracked) { |
| 288 | + if (skb->protocol == __constant_htons(ETH_P_IP)) { |
| 289 | + info.ctorigsrc = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; |
| 290 | + info.ctorigdst = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip; |
| 291 | + info.ctreplsrc = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip; |
| 292 | + info.ctrepldst = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip; |
| 293 | + } |
| 294 | + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { |
| 295 | + /* Again, hash ipv6 addresses into a single u32. */ |
| 296 | + info.ctorigsrc = jhash2(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6, 4, q->perturbation); |
| 297 | + info.ctorigdst = jhash2(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip6, 4, q->perturbation); |
| 298 | + info.ctreplsrc = jhash2(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6, 4, q->perturbation); |
| 299 | + info.ctrepldst = jhash2(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6, 4, q->perturbation); |
| 300 | + } |
| 301 | + |
| 302 | + } |
| 303 | +#endif |
| 304 | + |
| 305 | + switch(q->hash_kind) { |
| 306 | + case TCA_SFQ_HASH_CLASSIC: |
| 307 | + return esfq_jhash_3words(q, info.dst, info.src, info.proto); |
| 308 | + case TCA_SFQ_HASH_DST: |
| 309 | + return esfq_jhash_1word(q, info.dst); |
| 310 | + case TCA_SFQ_HASH_SRC: |
| 311 | + return esfq_jhash_1word(q, info.src); |
| 312 | + case TCA_SFQ_HASH_FWMARK: |
| 313 | + return esfq_jhash_1word(q, info.mark); |
| 314 | +#ifdef CONFIG_NET_SCH_ESFQ_NFCT |
| 315 | + case TCA_SFQ_HASH_CTORIGDST: |
| 316 | + return esfq_jhash_1word(q, info.ctorigdst); |
| 317 | + case TCA_SFQ_HASH_CTORIGSRC: |
| 318 | + return esfq_jhash_1word(q, info.ctorigsrc); |
| 319 | + case TCA_SFQ_HASH_CTREPLDST: |
| 320 | + return esfq_jhash_1word(q, info.ctrepldst); |
| 321 | + case TCA_SFQ_HASH_CTREPLSRC: |
| 322 | + return esfq_jhash_1word(q, info.ctreplsrc); |
| 323 | + case TCA_SFQ_HASH_CTNATCHG: |
| 324 | + { |
| 325 | + if (info.ctorigdst == info.ctreplsrc) |
| 326 | + return esfq_jhash_1word(q, info.ctorigsrc); |
| 327 | + return esfq_jhash_1word(q, info.ctreplsrc); |
| 328 | + } |
| 329 | +#endif |
| 330 | + default: |
| 331 | + if (net_ratelimit()) |
| 332 | + printk(KERN_WARNING "ESFQ: Unknown hash method. Falling back to classic.\n"); |
| 333 | + } |
| 334 | + return esfq_jhash_3words(q, info.dst, info.src, info.proto); |
| 335 | +} |
| 336 | + |
| 337 | +static inline void esfq_link(struct esfq_sched_data *q, esfq_index x) |
| 338 | +{ |
| 339 | + esfq_index p, n; |
| 340 | + int d = q->qs[x].qlen + q->depth; |
| 341 | + |
| 342 | + p = d; |
| 343 | + n = q->dep[d].next; |
| 344 | + q->dep[x].next = n; |
| 345 | + q->dep[x].prev = p; |
| 346 | + q->dep[p].next = q->dep[n].prev = x; |
| 347 | +} |
| 348 | + |
| 349 | +static inline void esfq_dec(struct esfq_sched_data *q, esfq_index x) |
| 350 | +{ |
| 351 | + esfq_index p, n; |
| 352 | + |
| 353 | + n = q->dep[x].next; |
| 354 | + p = q->dep[x].prev; |
| 355 | + q->dep[p].next = n; |
| 356 | + q->dep[n].prev = p; |
| 357 | + |
| 358 | + if (n == p && q->max_depth == q->qs[x].qlen + 1) |
| 359 | + q->max_depth--; |
| 360 | + |
| 361 | + esfq_link(q, x); |
| 362 | +} |
| 363 | + |
| 364 | +static inline void esfq_inc(struct esfq_sched_data *q, esfq_index x) |
| 365 | +{ |
| 366 | + esfq_index p, n; |
| 367 | + int d; |
| 368 | + |
| 369 | + n = q->dep[x].next; |
| 370 | + p = q->dep[x].prev; |
| 371 | + q->dep[p].next = n; |
| 372 | + q->dep[n].prev = p; |
| 373 | + d = q->qs[x].qlen; |
| 374 | + if (q->max_depth < d) |
| 375 | + q->max_depth = d; |
| 376 | + |
| 377 | + esfq_link(q, x); |
| 378 | +} |
| 379 | + |
| 380 | +static unsigned int esfq_drop(struct Qdisc *sch) |
| 381 | +{ |
| 382 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 383 | + esfq_index d = q->max_depth; |
| 384 | + struct sk_buff *skb; |
| 385 | + unsigned int len; |
| 386 | + |
| 387 | + /* Queue is full! Find the longest slot and |
| 388 | + drop a packet from it */ |
| 389 | + |
| 390 | + if (d > 1) { |
| 391 | + esfq_index x = q->dep[d+q->depth].next; |
| 392 | + skb = q->qs[x].prev; |
| 393 | + len = skb->len; |
| 394 | + __skb_unlink(skb, &q->qs[x]); |
| 395 | + kfree_skb(skb); |
| 396 | + esfq_dec(q, x); |
| 397 | + sch->q.qlen--; |
| 398 | + sch->qstats.drops++; |
| 399 | + sch->qstats.backlog -= len; |
| 400 | + return len; |
| 401 | + } |
| 402 | + |
| 403 | + if (d == 1) { |
| 404 | + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ |
| 405 | + d = q->next[q->tail]; |
| 406 | + q->next[q->tail] = q->next[d]; |
| 407 | + q->allot[q->next[d]] += q->quantum; |
| 408 | + skb = q->qs[d].prev; |
| 409 | + len = skb->len; |
| 410 | + __skb_unlink(skb, &q->qs[d]); |
| 411 | + kfree_skb(skb); |
| 412 | + esfq_dec(q, d); |
| 413 | + sch->q.qlen--; |
| 414 | + q->ht[q->hash[d]] = q->depth; |
| 415 | + sch->qstats.drops++; |
| 416 | + sch->qstats.backlog -= len; |
| 417 | + return len; |
| 418 | + } |
| 419 | + |
| 420 | + return 0; |
| 421 | +} |
| 422 | + |
| 423 | +static void esfq_q_enqueue(struct sk_buff *skb, struct esfq_sched_data *q, unsigned int end) |
| 424 | +{ |
| 425 | + unsigned hash = esfq_hash(q, skb); |
| 426 | + unsigned depth = q->depth; |
| 427 | + esfq_index x; |
| 428 | + |
| 429 | + x = q->ht[hash]; |
| 430 | + if (x == depth) { |
| 431 | + q->ht[hash] = x = q->dep[depth].next; |
| 432 | + q->hash[x] = hash; |
| 433 | + } |
| 434 | + |
| 435 | + if (end == ESFQ_TAIL) |
| 436 | + __skb_queue_tail(&q->qs[x], skb); |
| 437 | + else |
| 438 | + __skb_queue_head(&q->qs[x], skb); |
| 439 | + |
| 440 | + esfq_inc(q, x); |
| 441 | + if (q->qs[x].qlen == 1) { /* The flow is new */ |
| 442 | + if (q->tail == depth) { /* It is the first flow */ |
| 443 | + q->tail = x; |
| 444 | + q->next[x] = x; |
| 445 | + q->allot[x] = q->quantum; |
| 446 | + } else { |
| 447 | + q->next[x] = q->next[q->tail]; |
| 448 | + q->next[q->tail] = x; |
| 449 | + q->tail = x; |
| 450 | + } |
| 451 | + } |
| 452 | +} |
| 453 | + |
| 454 | +static int esfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) |
| 455 | +{ |
| 456 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 457 | + esfq_q_enqueue(skb, q, ESFQ_TAIL); |
| 458 | + sch->qstats.backlog += skb->len; |
| 459 | + if (++sch->q.qlen < q->limit-1) { |
| 460 | + sch->bstats.bytes += skb->len; |
| 461 | + sch->bstats.packets++; |
| 462 | + return 0; |
| 463 | + } |
| 464 | + |
| 465 | + sch->qstats.drops++; |
| 466 | + esfq_drop(sch); |
| 467 | + return NET_XMIT_CN; |
| 468 | +} |
| 469 | + |
| 470 | +static struct sk_buff *esfq_peek(struct Qdisc* sch) |
| 471 | +{ |
| 472 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 473 | + esfq_index a; |
| 474 | + |
| 475 | + /* No active slots */ |
| 476 | + if (q->tail == q->depth) |
| 477 | + return NULL; |
| 478 | + |
| 479 | + a = q->next[q->tail]; |
| 480 | + return skb_peek(&q->qs[a]); |
| 481 | +} |
| 482 | + |
| 483 | +static struct sk_buff *esfq_q_dequeue(struct esfq_sched_data *q) |
| 484 | +{ |
| 485 | + struct sk_buff *skb; |
| 486 | + unsigned depth = q->depth; |
| 487 | + esfq_index a, old_a; |
| 488 | + |
| 489 | + /* No active slots */ |
| 490 | + if (q->tail == depth) |
| 491 | + return NULL; |
| 492 | + |
| 493 | + a = old_a = q->next[q->tail]; |
| 494 | + |
| 495 | + /* Grab packet */ |
| 496 | + skb = __skb_dequeue(&q->qs[a]); |
| 497 | + esfq_dec(q, a); |
| 498 | + |
| 499 | + /* Is the slot empty? */ |
| 500 | + if (q->qs[a].qlen == 0) { |
| 501 | + q->ht[q->hash[a]] = depth; |
| 502 | + a = q->next[a]; |
| 503 | + if (a == old_a) { |
| 504 | + q->tail = depth; |
| 505 | + return skb; |
| 506 | + } |
| 507 | + q->next[q->tail] = a; |
| 508 | + q->allot[a] += q->quantum; |
| 509 | + } else if ((q->allot[a] -= skb->len) <= 0) { |
| 510 | + q->tail = a; |
| 511 | + a = q->next[a]; |
| 512 | + q->allot[a] += q->quantum; |
| 513 | + } |
| 514 | + |
| 515 | + return skb; |
| 516 | +} |
| 517 | + |
| 518 | +static struct sk_buff *esfq_dequeue(struct Qdisc* sch) |
| 519 | +{ |
| 520 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 521 | + struct sk_buff *skb; |
| 522 | + |
| 523 | + skb = esfq_q_dequeue(q); |
| 524 | + if (skb == NULL) |
| 525 | + return NULL; |
| 526 | + sch->q.qlen--; |
| 527 | + sch->qstats.backlog -= skb->len; |
| 528 | + return skb; |
| 529 | +} |
| 530 | + |
| 531 | +static void esfq_q_destroy(struct esfq_sched_data *q) |
| 532 | +{ |
| 533 | + del_timer(&q->perturb_timer); |
| 534 | + if(q->ht) |
| 535 | + kfree(q->ht); |
| 536 | + if(q->dep) |
| 537 | + kfree(q->dep); |
| 538 | + if(q->next) |
| 539 | + kfree(q->next); |
| 540 | + if(q->allot) |
| 541 | + kfree(q->allot); |
| 542 | + if(q->hash) |
| 543 | + kfree(q->hash); |
| 544 | + if(q->qs) |
| 545 | + kfree(q->qs); |
| 546 | +} |
| 547 | + |
| 548 | +static void esfq_destroy(struct Qdisc *sch) |
| 549 | +{ |
| 550 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 551 | + esfq_q_destroy(q); |
| 552 | +} |
| 553 | + |
| 554 | + |
| 555 | +static void esfq_reset(struct Qdisc* sch) |
| 556 | +{ |
| 557 | + struct sk_buff *skb; |
| 558 | + |
| 559 | + while ((skb = esfq_dequeue(sch)) != NULL) |
| 560 | + kfree_skb(skb); |
| 561 | +} |
| 562 | + |
| 563 | +static void esfq_perturbation(unsigned long arg) |
| 564 | +{ |
| 565 | + struct Qdisc *sch = (struct Qdisc*)arg; |
| 566 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 567 | + |
| 568 | + q->perturbation = net_random()&0x1F; |
| 569 | + |
| 570 | + if (q->perturb_period) { |
| 571 | + q->perturb_timer.expires = jiffies + q->perturb_period; |
| 572 | + add_timer(&q->perturb_timer); |
| 573 | + } |
| 574 | +} |
| 575 | + |
| 576 | +static unsigned int esfq_check_hash(unsigned int kind) |
| 577 | +{ |
| 578 | + switch (kind) { |
| 579 | + case TCA_SFQ_HASH_CTORIGDST: |
| 580 | + case TCA_SFQ_HASH_CTORIGSRC: |
| 581 | + case TCA_SFQ_HASH_CTREPLDST: |
| 582 | + case TCA_SFQ_HASH_CTREPLSRC: |
| 583 | + case TCA_SFQ_HASH_CTNATCHG: |
| 584 | +#ifndef CONFIG_NET_SCH_ESFQ_NFCT |
| 585 | + { |
| 586 | + if (net_ratelimit()) |
| 587 | + printk(KERN_WARNING "ESFQ: Conntrack hash types disabled in kernel config. Falling back to classic.\n"); |
| 588 | + return TCA_SFQ_HASH_CLASSIC; |
| 589 | + } |
| 590 | +#endif |
| 591 | + case TCA_SFQ_HASH_CLASSIC: |
| 592 | + case TCA_SFQ_HASH_DST: |
| 593 | + case TCA_SFQ_HASH_SRC: |
| 594 | + case TCA_SFQ_HASH_FWMARK: |
| 595 | + return kind; |
| 596 | + default: |
| 597 | + { |
| 598 | + if (net_ratelimit()) |
| 599 | + printk(KERN_WARNING "ESFQ: Unknown hash type. Falling back to classic.\n"); |
| 600 | + return TCA_SFQ_HASH_CLASSIC; |
| 601 | + } |
| 602 | + } |
| 603 | +} |
| 604 | + |
| 605 | +static int esfq_q_init(struct esfq_sched_data *q, struct nlattr *opt) |
| 606 | +{ |
| 607 | + struct tc_esfq_qopt *ctl = nla_data(opt); |
| 608 | + esfq_index p = ~0U/2; |
| 609 | + int i; |
| 610 | + |
| 611 | + if (opt && opt->nla_len < nla_attr_size(sizeof(*ctl))) |
| 612 | + return -EINVAL; |
| 613 | + |
| 614 | + q->perturbation = 0; |
| 615 | + q->hash_kind = TCA_SFQ_HASH_CLASSIC; |
| 616 | + q->max_depth = 0; |
| 617 | + if (opt == NULL) { |
| 618 | + q->perturb_period = 0; |
| 619 | + q->hash_divisor = 1024; |
| 620 | + q->tail = q->limit = q->depth = 128; |
| 621 | + |
| 622 | + } else { |
| 623 | + struct tc_esfq_qopt *ctl = nla_data(opt); |
| 624 | + if (ctl->quantum) |
| 625 | + q->quantum = ctl->quantum; |
| 626 | + q->perturb_period = ctl->perturb_period*HZ; |
| 627 | + q->hash_divisor = ctl->divisor ? : 1024; |
| 628 | + q->tail = q->limit = q->depth = ctl->flows ? : 128; |
| 629 | + |
| 630 | + if ( q->depth > p - 1 ) |
| 631 | + return -EINVAL; |
| 632 | + |
| 633 | + if (ctl->limit) |
| 634 | + q->limit = min_t(u32, ctl->limit, q->depth); |
| 635 | + |
| 636 | + if (ctl->hash_kind) { |
| 637 | + q->hash_kind = esfq_check_hash(ctl->hash_kind); |
| 638 | + } |
| 639 | + } |
| 640 | + |
| 641 | + q->ht = kmalloc(q->hash_divisor*sizeof(esfq_index), GFP_KERNEL); |
| 642 | + if (!q->ht) |
| 643 | + goto err_case; |
| 644 | + q->dep = kmalloc((1+q->depth*2)*sizeof(struct esfq_head), GFP_KERNEL); |
| 645 | + if (!q->dep) |
| 646 | + goto err_case; |
| 647 | + q->next = kmalloc(q->depth*sizeof(esfq_index), GFP_KERNEL); |
| 648 | + if (!q->next) |
| 649 | + goto err_case; |
| 650 | + q->allot = kmalloc(q->depth*sizeof(short), GFP_KERNEL); |
| 651 | + if (!q->allot) |
| 652 | + goto err_case; |
| 653 | + q->hash = kmalloc(q->depth*sizeof(unsigned short), GFP_KERNEL); |
| 654 | + if (!q->hash) |
| 655 | + goto err_case; |
| 656 | + q->qs = kmalloc(q->depth*sizeof(struct sk_buff_head), GFP_KERNEL); |
| 657 | + if (!q->qs) |
| 658 | + goto err_case; |
| 659 | + |
| 660 | + for (i=0; i< q->hash_divisor; i++) |
| 661 | + q->ht[i] = q->depth; |
| 662 | + for (i=0; i<q->depth; i++) { |
| 663 | + skb_queue_head_init(&q->qs[i]); |
| 664 | + q->dep[i+q->depth].next = i+q->depth; |
| 665 | + q->dep[i+q->depth].prev = i+q->depth; |
| 666 | + } |
| 667 | + |
| 668 | + for (i=0; i<q->depth; i++) |
| 669 | + esfq_link(q, i); |
| 670 | + return 0; |
| 671 | +err_case: |
| 672 | + esfq_q_destroy(q); |
| 673 | + return -ENOBUFS; |
| 674 | +} |
| 675 | + |
| 676 | +static int esfq_init(struct Qdisc *sch, struct nlattr *opt) |
| 677 | +{ |
| 678 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 679 | + int err; |
| 680 | + |
| 681 | + q->quantum = psched_mtu(qdisc_dev(sch)); /* default */ |
| 682 | + if ((err = esfq_q_init(q, opt))) |
| 683 | + return err; |
| 684 | + |
| 685 | + init_timer(&q->perturb_timer); |
| 686 | + q->perturb_timer.data = (unsigned long)sch; |
| 687 | + q->perturb_timer.function = esfq_perturbation; |
| 688 | + if (q->perturb_period) { |
| 689 | + q->perturb_timer.expires = jiffies + q->perturb_period; |
| 690 | + add_timer(&q->perturb_timer); |
| 691 | + } |
| 692 | + |
| 693 | + return 0; |
| 694 | +} |
| 695 | + |
| 696 | +static int esfq_change(struct Qdisc *sch, struct nlattr *opt) |
| 697 | +{ |
| 698 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 699 | + struct esfq_sched_data new; |
| 700 | + struct sk_buff *skb; |
| 701 | + int err; |
| 702 | + |
| 703 | + /* set up new queue */ |
| 704 | + memset(&new, 0, sizeof(struct esfq_sched_data)); |
| 705 | + new.quantum = psched_mtu(qdisc_dev(sch)); /* default */ |
| 706 | + if ((err = esfq_q_init(&new, opt))) |
| 707 | + return err; |
| 708 | + |
| 709 | + /* copy all packets from the old queue to the new queue */ |
| 710 | + sch_tree_lock(sch); |
| 711 | + while ((skb = esfq_q_dequeue(q)) != NULL) |
| 712 | + esfq_q_enqueue(skb, &new, ESFQ_TAIL); |
| 713 | + |
| 714 | + /* clean up the old queue */ |
| 715 | + esfq_q_destroy(q); |
| 716 | + |
| 717 | + /* copy elements of the new queue into the old queue */ |
| 718 | + q->perturb_period = new.perturb_period; |
| 719 | + q->quantum = new.quantum; |
| 720 | + q->limit = new.limit; |
| 721 | + q->depth = new.depth; |
| 722 | + q->hash_divisor = new.hash_divisor; |
| 723 | + q->hash_kind = new.hash_kind; |
| 724 | + q->tail = new.tail; |
| 725 | + q->max_depth = new.max_depth; |
| 726 | + q->ht = new.ht; |
| 727 | + q->dep = new.dep; |
| 728 | + q->next = new.next; |
| 729 | + q->allot = new.allot; |
| 730 | + q->hash = new.hash; |
| 731 | + q->qs = new.qs; |
| 732 | + |
| 733 | + /* finish up */ |
| 734 | + if (q->perturb_period) { |
| 735 | + q->perturb_timer.expires = jiffies + q->perturb_period; |
| 736 | + add_timer(&q->perturb_timer); |
| 737 | + } else { |
| 738 | + q->perturbation = 0; |
| 739 | + } |
| 740 | + sch_tree_unlock(sch); |
| 741 | + return 0; |
| 742 | +} |
| 743 | + |
| 744 | +static int esfq_dump(struct Qdisc *sch, struct sk_buff *skb) |
| 745 | +{ |
| 746 | + struct esfq_sched_data *q = qdisc_priv(sch); |
| 747 | + unsigned char *b = skb_tail_pointer(skb); |
| 748 | + struct tc_esfq_qopt opt; |
| 749 | + |
| 750 | + opt.quantum = q->quantum; |
| 751 | + opt.perturb_period = q->perturb_period/HZ; |
| 752 | + |
| 753 | + opt.limit = q->limit; |
| 754 | + opt.divisor = q->hash_divisor; |
| 755 | + opt.flows = q->depth; |
| 756 | + opt.hash_kind = q->hash_kind; |
| 757 | + |
| 758 | + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); |
| 759 | + |
| 760 | + return skb->len; |
| 761 | + |
| 762 | +nla_put_failure: |
| 763 | + nlmsg_trim(skb, b); |
| 764 | + return -1; |
| 765 | +} |
| 766 | + |
| 767 | +static struct Qdisc_ops esfq_qdisc_ops = |
| 768 | +{ |
| 769 | + .next = NULL, |
| 770 | + .cl_ops = NULL, |
| 771 | + .id = "esfq", |
| 772 | + .priv_size = sizeof(struct esfq_sched_data), |
| 773 | + .enqueue = esfq_enqueue, |
| 774 | + .dequeue = esfq_dequeue, |
| 775 | + .peek = esfq_peek, |
| 776 | + .drop = esfq_drop, |
| 777 | + .init = esfq_init, |
| 778 | + .reset = esfq_reset, |
| 779 | + .destroy = esfq_destroy, |
| 780 | + .change = esfq_change, |
| 781 | + .dump = esfq_dump, |
| 782 | + .owner = THIS_MODULE, |
| 783 | +}; |
| 784 | + |
| 785 | +static int __init esfq_module_init(void) |
| 786 | +{ |
| 787 | + return register_qdisc(&esfq_qdisc_ops); |
| 788 | +} |
| 789 | +static void __exit esfq_module_exit(void) |
| 790 | +{ |
| 791 | + unregister_qdisc(&esfq_qdisc_ops); |
| 792 | +} |
| 793 | +module_init(esfq_module_init) |
| 794 | +module_exit(esfq_module_exit) |
| 795 | +MODULE_LICENSE("GPL"); |
| 796 | |