lib/librte_sched/rte_sched.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <string.h>

#include <rte_common.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_branch_prediction.h>
#include <rte_mbuf.h>
#include <rte_bitmap.h>
#include <rte_reciprocal.h>

#include "rte_sched.h"
#include "rte_sched_common.h"
#include "rte_approx.h"

#ifdef __INTEL_COMPILER
#pragma warning(disable:2259) /* conversion may lose significant bits */
#endif

#ifdef RTE_SCHED_VECTOR
#include <rte_vect.h>

#ifdef RTE_ARCH_X86
#define SCHED_VECTOR_SSE4
#elif defined(RTE_MACHINE_CPUFLAG_NEON)
#define SCHED_VECTOR_NEON
#endif

#endif

#define RTE_SCHED_TB_RATE_CONFIG_ERR          (1e-7)
#define RTE_SCHED_WRR_SHIFT                   3
#define RTE_SCHED_MAX_QUEUES_PER_TC           RTE_SCHED_BE_QUEUES_PER_PIPE
#define RTE_SCHED_GRINDER_PCACHE_SIZE         (64 / RTE_SCHED_QUEUES_PER_PIPE)
#define RTE_SCHED_PIPE_INVALID                UINT32_MAX
#define RTE_SCHED_BMP_POS_INVALID             UINT32_MAX

/* Scaling for cycles_per_byte calculation
 * Chosen so that minimum rate is 480 bit/sec
 */
#define RTE_SCHED_TIME_SHIFT                  8
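
/*
 * Worked example (illustrative): cycles_per_byte is computed later as
 * (tsc_hz << RTE_SCHED_TIME_SHIFT) / rate and must fit in 32 bits for the
 * reciprocal division. Assuming a 1 GHz TSC, a rate of 60 bytes/sec
 * (480 bit/sec) gives (1e9 * 256) / 60 ~= 4.27e9, just under
 * UINT32_MAX ~= 4.29e9; any lower rate would overflow the 32-bit value,
 * hence the minimum rate quoted above.
 */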

struct rte_sched_pipe_profile {
        /* Token bucket (TB) */
        uint64_t tb_period;
        uint64_t tb_credits_per_period;
        uint64_t tb_size;

        /* Pipe traffic classes */
        uint64_t tc_period;
        uint64_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint8_t tc_ov_weight;

        /* Pipe best-effort traffic class queues */
        uint8_t  wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
};

struct rte_sched_pipe {
        /* Token bucket (TB) */
        uint64_t tb_time; /* time of last update */
        uint64_t tb_credits;

        /* Pipe profile and flags */
        uint32_t profile;

        /* Traffic classes (TCs) */
        uint64_t tc_time; /* time of next update */
        uint64_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];

        /* Weighted Round Robin (WRR) */
        uint8_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE];

        /* TC oversubscription */
        uint64_t tc_ov_credits;
        uint8_t tc_ov_period_id;
} __rte_cache_aligned;

struct rte_sched_queue {
        uint16_t qw;
        uint16_t qr;
};

struct rte_sched_queue_extra {
        struct rte_sched_queue_stats stats;
#ifdef RTE_SCHED_RED
        struct rte_red red;
#endif
};

enum grinder_state {
        e_GRINDER_PREFETCH_PIPE = 0,
        e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS,
        e_GRINDER_PREFETCH_MBUF,
        e_GRINDER_READ_MBUF
};

struct rte_sched_grinder {
        /* Pipe cache */
        uint16_t pcache_qmask[RTE_SCHED_GRINDER_PCACHE_SIZE];
        uint32_t pcache_qindex[RTE_SCHED_GRINDER_PCACHE_SIZE];
        uint32_t pcache_w;
        uint32_t pcache_r;

        /* Current pipe */
        enum grinder_state state;
        uint32_t productive;
        uint32_t pindex;
        struct rte_sched_subport *subport;
        struct rte_sched_pipe *pipe;
        struct rte_sched_pipe_profile *pipe_params;

        /* TC cache */
        uint8_t tccache_qmask[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tccache_qindex[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tccache_w;
        uint32_t tccache_r;

        /* Current TC */
        uint32_t tc_index;
        struct rte_sched_queue *queue[RTE_SCHED_MAX_QUEUES_PER_TC];
        struct rte_mbuf **qbase[RTE_SCHED_MAX_QUEUES_PER_TC];
        uint32_t qindex[RTE_SCHED_MAX_QUEUES_PER_TC];
        uint16_t qsize;
        uint32_t qmask;
        uint32_t qpos;
        struct rte_mbuf *pkt;

        /* WRR */
        uint16_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint16_t wrr_mask[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint8_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
};

struct rte_sched_subport {
        /* Token bucket (TB) */
        uint64_t tb_time; /* time of last update */
        uint64_t tb_period;
        uint64_t tb_credits_per_period;
        uint64_t tb_size;
        uint64_t tb_credits;

        /* Traffic classes (TCs) */
        uint64_t tc_time; /* time of next update */
        uint64_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint64_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint64_t tc_period;

        /* TC oversubscription */
        uint64_t tc_ov_wm;
        uint64_t tc_ov_wm_min;
        uint64_t tc_ov_wm_max;
        uint8_t tc_ov_period_id;
        uint8_t tc_ov;
        uint32_t tc_ov_n;
        double tc_ov_rate;

        /* Statistics */
        struct rte_sched_subport_stats stats __rte_cache_aligned;

        /* Subport pipes */
        uint32_t n_pipes_per_subport_enabled;
        uint32_t n_pipe_profiles;
        uint32_t n_max_pipe_profiles;

        /* Pipe best-effort TC rate */
        uint64_t pipe_tc_be_rate_max;

        /* Pipe queues size */
        uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];

#ifdef RTE_SCHED_RED
        struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS];
#endif

        /* Scheduling loop detection */
        uint32_t pipe_loop;
        uint32_t pipe_exhaustion;

        /* Bitmap */
        struct rte_bitmap *bmp;
        uint32_t grinder_base_bmp_pos[RTE_SCHED_PORT_N_GRINDERS] __rte_aligned_16;

        /* Grinders */
        struct rte_sched_grinder grinder[RTE_SCHED_PORT_N_GRINDERS];
        uint32_t busy_grinders;

        /* Queue base calculation */
        uint32_t qsize_add[RTE_SCHED_QUEUES_PER_PIPE];
        uint32_t qsize_sum;

        struct rte_sched_pipe *pipe;
        struct rte_sched_queue *queue;
        struct rte_sched_queue_extra *queue_extra;
        struct rte_sched_pipe_profile *pipe_profiles;
        uint8_t *bmp_array;
        struct rte_mbuf **queue_array;
        uint8_t memory[0] __rte_cache_aligned;
} __rte_cache_aligned;

struct rte_sched_port {
        /* User parameters */
        uint32_t n_subports_per_port;
        uint32_t n_pipes_per_subport;
        uint32_t n_pipes_per_subport_log2;
        uint16_t pipe_queue[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint8_t pipe_tc[RTE_SCHED_QUEUES_PER_PIPE];
        uint8_t tc_queue[RTE_SCHED_QUEUES_PER_PIPE];
        uint64_t rate;
        uint32_t mtu;
        uint32_t frame_overhead;
        int socket;

        /* Timing */
        uint64_t time_cpu_cycles;     /* Current CPU time measured in CPU cycles */
        uint64_t time_cpu_bytes;      /* Current CPU time measured in bytes */
        uint64_t time;                /* Current NIC TX time measured in bytes */
        struct rte_reciprocal inv_cycles_per_byte; /* CPU cycles per byte */
        uint64_t cycles_per_byte;

        /* Grinders */
        struct rte_mbuf **pkts_out;
        uint32_t n_pkts_out;
        uint32_t subport_id;

        /* Large data structures */
        struct rte_sched_subport *subports[0] __rte_cache_aligned;
} __rte_cache_aligned;

enum rte_sched_subport_array {
        e_RTE_SCHED_SUBPORT_ARRAY_PIPE = 0,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA,
        e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES,
        e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY,
        e_RTE_SCHED_SUBPORT_ARRAY_TOTAL,
};

static inline uint32_t
rte_sched_subport_pipe_queues(struct rte_sched_subport *subport)
{
        return RTE_SCHED_QUEUES_PER_PIPE * subport->n_pipes_per_subport_enabled;
}

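/*
 * Note: within a subport, the lowest 4 bits of a queue index select the
 * queue position inside the pipe (RTE_SCHED_QUEUES_PER_PIPE == 16) and the
 * remaining upper bits select the pipe. For example, qindex 0x35 maps to
 * pipe 3 (0x35 >> 4) and queue position 5 (0x35 & 0xF), i.e. the queue of
 * strict-priority traffic class 5.
 */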
static inline struct rte_mbuf **
rte_sched_subport_pipe_qbase(struct rte_sched_subport *subport, uint32_t qindex)
{
        uint32_t pindex = qindex >> 4;
        uint32_t qpos = qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1);

        return (subport->queue_array + pindex *
                subport->qsize_sum + subport->qsize_add[qpos]);
}

static inline uint16_t
rte_sched_subport_pipe_qsize(struct rte_sched_port *port,
struct rte_sched_subport *subport, uint32_t qindex)
{
        uint32_t tc = port->pipe_tc[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return subport->qsize[tc];
}

static inline uint32_t
rte_sched_port_queues_per_port(struct rte_sched_port *port)
{
        uint32_t n_queues = 0, i;

        for (i = 0; i < port->n_subports_per_port; i++)
                n_queues += rte_sched_subport_pipe_queues(port->subports[i]);

        return n_queues;
}

static inline uint16_t
rte_sched_port_pipe_queue(struct rte_sched_port *port, uint32_t traffic_class)
{
        uint16_t pipe_queue = port->pipe_queue[traffic_class];

        return pipe_queue;
}

static inline uint8_t
rte_sched_port_pipe_tc(struct rte_sched_port *port, uint32_t qindex)
{
        uint8_t pipe_tc = port->pipe_tc[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return pipe_tc;
}

static inline uint8_t
rte_sched_port_tc_queue(struct rte_sched_port *port, uint32_t qindex)
{
        uint8_t tc_queue = port->tc_queue[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return tc_queue;
}

static int
pipe_profile_check(struct rte_sched_pipe_params *params,
        uint64_t rate, uint16_t *qsize)
{
        uint32_t i;

        /* Pipe parameters */
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        /* TB rate: non-zero, not greater than port rate */
        if (params->tb_rate == 0 ||
                params->tb_rate > rate) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb rate\n", __func__);
                return -EINVAL;
        }

        /* TB size: non-zero */
        if (params->tb_size == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb size\n", __func__);
                return -EINVAL;
        }

        /* TC rate: non-zero if qsize non-zero, less than pipe rate */
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if ((qsize[i] == 0 && params->tc_rate[i] != 0) ||
                        (qsize[i] != 0 && (params->tc_rate[i] == 0 ||
                        params->tc_rate[i] > params->tb_rate))) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for qsize or tc_rate\n", __func__);
                        return -EINVAL;
                }
        }

        if (params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE] == 0 ||
                qsize[RTE_SCHED_TRAFFIC_CLASS_BE] == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for be traffic class rate\n", __func__);
                return -EINVAL;
        }

        /* TC period: non-zero */
        if (params->tc_period == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc period\n", __func__);
                return -EINVAL;
        }

        /* Best effort tc oversubscription weight: non-zero */
        if (params->tc_ov_weight == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc ov weight\n", __func__);
                return -EINVAL;
        }

        /* Queue WRR weights: non-zero */
        for (i = 0; i < RTE_SCHED_BE_QUEUES_PER_PIPE; i++) {
                if (params->wrr_weights[i] == 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for wrr weight\n", __func__);
                        return -EINVAL;
                }
        }

        return 0;
}
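
/*
 * Illustrative sketch (not part of the build): one possible pipe profile
 * that passes the checks above, assuming a port rate of 1250000000
 * bytes/sec and all traffic class queues enabled. All values are
 * examples only.
 *
 *	static struct rte_sched_pipe_params pp_example = {
 *		.tb_rate = 305175,
 *		.tb_size = 1000000,
 *		.tc_rate = {305175, 305175, 305175, 305175, 305175, 305175,
 *			305175, 305175, 305175, 305175, 305175, 305175,
 *			305175},
 *		.tc_period = 40,
 *		.tc_ov_weight = 1,
 *		.wrr_weights = {1, 1, 1, 1},
 *	};
 */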

static int
rte_sched_port_check_params(struct rte_sched_port_params *params)
{
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        /* socket */
        if (params->socket < 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for socket id\n", __func__);
                return -EINVAL;
        }

        /* rate */
        if (params->rate == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for rate\n", __func__);
                return -EINVAL;
        }

        /* mtu */
        if (params->mtu == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for mtu\n", __func__);
                return -EINVAL;
        }

        /* n_subports_per_port: non-zero, limited to 16 bits, power of 2 */
        if (params->n_subports_per_port == 0 ||
            params->n_subports_per_port > 1u << 16 ||
            !rte_is_power_of_2(params->n_subports_per_port)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for number of subports\n", __func__);
                return -EINVAL;
        }

        /* n_pipes_per_subport: non-zero, power of 2 */
        if (params->n_pipes_per_subport == 0 ||
            !rte_is_power_of_2(params->n_pipes_per_subport)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for maximum pipes number\n", __func__);
                return -EINVAL;
        }

        return 0;
}

static uint32_t
rte_sched_subport_get_array_base(struct rte_sched_subport_params *params,
        enum rte_sched_subport_array array)
{
        uint32_t n_pipes_per_subport = params->n_pipes_per_subport_enabled;
        uint32_t n_subport_pipe_queues =
                RTE_SCHED_QUEUES_PER_PIPE * n_pipes_per_subport;

        uint32_t size_pipe = n_pipes_per_subport * sizeof(struct rte_sched_pipe);
        uint32_t size_queue =
                n_subport_pipe_queues * sizeof(struct rte_sched_queue);
        uint32_t size_queue_extra
                = n_subport_pipe_queues * sizeof(struct rte_sched_queue_extra);
        uint32_t size_pipe_profiles = params->n_max_pipe_profiles *
                sizeof(struct rte_sched_pipe_profile);
        uint32_t size_bmp_array =
                rte_bitmap_get_memory_footprint(n_subport_pipe_queues);
        uint32_t size_per_pipe_queue_array, size_queue_array;

        uint32_t base, i;

        size_per_pipe_queue_array = 0;
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if (i < RTE_SCHED_TRAFFIC_CLASS_BE)
                        size_per_pipe_queue_array +=
                                params->qsize[i] * sizeof(struct rte_mbuf *);
                else
                        size_per_pipe_queue_array += RTE_SCHED_MAX_QUEUES_PER_TC *
                                params->qsize[i] * sizeof(struct rte_mbuf *);
        }
        size_queue_array = n_pipes_per_subport * size_per_pipe_queue_array;

        base = 0;

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_PIPE)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_pipe);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue_extra);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_pipe_profiles);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_bmp_array);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue_array);

        return base;
}

static void
rte_sched_subport_config_qsize(struct rte_sched_subport *subport)
{
        uint32_t i;

        subport->qsize_add[0] = 0;

        /* Strict priority traffic class */
        for (i = 1; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                subport->qsize_add[i] = subport->qsize_add[i-1] + subport->qsize[i-1];

        /* Best-effort traffic class */
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 1] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 2] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 1] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 3] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 2] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];

        subport->qsize_sum = subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 3] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
}
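
/*
 * Worked example (illustrative): with all 13 traffic classes enabled and
 * qsize[i] = 64 for each, the 12 strict-priority queues start at offsets
 * 0, 64, ..., 704, the 4 best-effort queues start at offsets 768, 832,
 * 896 and 960, and qsize_sum = 1024 mbuf pointers per pipe.
 */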

static void
rte_sched_port_log_pipe_profile(struct rte_sched_subport *subport, uint32_t i)
{
        struct rte_sched_pipe_profile *p = subport->pipe_profiles + i;

        RTE_LOG(DEBUG, SCHED, "Low level config for pipe profile %u:\n"
                "       Token bucket: period = %"PRIu64", credits per period = %"PRIu64", size = %"PRIu64"\n"
                "       Traffic classes: period = %"PRIu64",\n"
                "       credits per period = [%"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64
                ", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64
                ", %"PRIu64", %"PRIu64", %"PRIu64"]\n"
                "       Best-effort traffic class oversubscription: weight = %hhu\n"
                "       WRR cost: [%hhu, %hhu, %hhu, %hhu]\n",
                i,

                /* Token bucket */
                p->tb_period,
                p->tb_credits_per_period,
                p->tb_size,

                /* Traffic classes */
                p->tc_period,
                p->tc_credits_per_period[0],
                p->tc_credits_per_period[1],
                p->tc_credits_per_period[2],
                p->tc_credits_per_period[3],
                p->tc_credits_per_period[4],
                p->tc_credits_per_period[5],
                p->tc_credits_per_period[6],
                p->tc_credits_per_period[7],
                p->tc_credits_per_period[8],
                p->tc_credits_per_period[9],
                p->tc_credits_per_period[10],
                p->tc_credits_per_period[11],
                p->tc_credits_per_period[12],

                /* Best-effort traffic class oversubscription */
                p->tc_ov_weight,

                /* WRR */
                p->wrr_cost[0], p->wrr_cost[1], p->wrr_cost[2], p->wrr_cost[3]);
}

static inline uint64_t
rte_sched_time_ms_to_bytes(uint64_t time_ms, uint64_t rate)
{
        uint64_t time = time_ms;

        time = (time * rate) / 1000;

        return time;
}
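
/*
 * Worked example (illustrative): at a rate of 1250000000 bytes/sec
 * (10 GbE) and tc_period = 10 ms, the function returns
 * (10 * 1250000000) / 1000 = 12500000 bytes of credit per period.
 */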

static void
rte_sched_pipe_profile_convert(struct rte_sched_subport *subport,
        struct rte_sched_pipe_params *src,
        struct rte_sched_pipe_profile *dst,
        uint64_t rate)
{
        uint32_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint32_t lcd1, lcd2, lcd;
        uint32_t i;

        /* Token Bucket */
        if (src->tb_rate == rate) {
                dst->tb_credits_per_period = 1;
                dst->tb_period = 1;
        } else {
                double tb_rate = (double) src->tb_rate
                                / (double) rate;
                double d = RTE_SCHED_TB_RATE_CONFIG_ERR;

                rte_approx_64(tb_rate, d, &dst->tb_credits_per_period,
                        &dst->tb_period);
        }

        dst->tb_size = src->tb_size;

        /* Traffic Classes */
        dst->tc_period = rte_sched_time_ms_to_bytes(src->tc_period,
                                                rate);

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (subport->qsize[i])
                        dst->tc_credits_per_period[i]
                                = rte_sched_time_ms_to_bytes(src->tc_period,
                                        src->tc_rate[i]);

        dst->tc_ov_weight = src->tc_ov_weight;

        /* WRR queues */
        wrr_cost[0] = src->wrr_weights[0];
        wrr_cost[1] = src->wrr_weights[1];
        wrr_cost[2] = src->wrr_weights[2];
        wrr_cost[3] = src->wrr_weights[3];

        lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]);
        lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]);
        lcd = rte_get_lcd(lcd1, lcd2);

        wrr_cost[0] = lcd / wrr_cost[0];
        wrr_cost[1] = lcd / wrr_cost[1];
        wrr_cost[2] = lcd / wrr_cost[2];
        wrr_cost[3] = lcd / wrr_cost[3];

        dst->wrr_cost[0] = (uint8_t) wrr_cost[0];
        dst->wrr_cost[1] = (uint8_t) wrr_cost[1];
        dst->wrr_cost[2] = (uint8_t) wrr_cost[2];
        dst->wrr_cost[3] = (uint8_t) wrr_cost[3];
}
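
/*
 * Worked WRR example (illustrative): for wrr_weights = {1, 2, 4, 8},
 * rte_get_lcd() (rte_sched_common.h) yields lcd(1, 2) = 2, lcd(4, 8) = 8
 * and lcd(2, 8) = 8, so wrr_cost = {8, 4, 2, 1}. A queue with a lower
 * cost consumes its WRR tokens more slowly and therefore receives a
 * proportionally larger share of the best-effort bandwidth.
 */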

static void
rte_sched_subport_config_pipe_profile_table(struct rte_sched_subport *subport,
        struct rte_sched_subport_params *params, uint64_t rate)
{
        uint32_t i;

        for (i = 0; i < subport->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *src = params->pipe_profiles + i;
                struct rte_sched_pipe_profile *dst = subport->pipe_profiles + i;

                rte_sched_pipe_profile_convert(subport, src, dst, rate);
                rte_sched_port_log_pipe_profile(subport, i);
        }

        subport->pipe_tc_be_rate_max = 0;
        for (i = 0; i < subport->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *src = params->pipe_profiles + i;
                uint64_t pipe_tc_be_rate = src->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE];

                if (subport->pipe_tc_be_rate_max < pipe_tc_be_rate)
                        subport->pipe_tc_be_rate_max = pipe_tc_be_rate;
        }
}

static int
rte_sched_subport_check_params(struct rte_sched_subport_params *params,
        uint32_t n_max_pipes_per_subport,
        uint64_t rate)
{
        uint32_t i;

        /* Check user parameters */
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        if (params->tb_rate == 0 || params->tb_rate > rate) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb rate\n", __func__);
                return -EINVAL;
        }

        if (params->tb_size == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb size\n", __func__);
                return -EINVAL;
        }

        /* qsize: if non-zero, power of 2,
         * no bigger than 32K (due to 16-bit read/write pointers)
         */
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint16_t qsize = params->qsize[i];

                if (qsize != 0 && !rte_is_power_of_2(qsize)) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for qsize\n", __func__);
                        return -EINVAL;
                }
        }

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint64_t tc_rate = params->tc_rate[i];
                uint16_t qsize = params->qsize[i];

                if ((qsize == 0 && tc_rate != 0) ||
                        (qsize != 0 && tc_rate == 0) ||
                        (tc_rate > params->tb_rate)) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for tc rate\n", __func__);
                        return -EINVAL;
                }
        }

        if (params->qsize[RTE_SCHED_TRAFFIC_CLASS_BE] == 0 ||
                params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE] == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect qsize or tc rate (best effort)\n", __func__);
                return -EINVAL;
        }

        if (params->tc_period == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc period\n", __func__);
                return -EINVAL;
        }

        /* n_pipes_per_subport: non-zero, power of 2 */
        if (params->n_pipes_per_subport_enabled == 0 ||
                params->n_pipes_per_subport_enabled > n_max_pipes_per_subport ||
            !rte_is_power_of_2(params->n_pipes_per_subport_enabled)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for pipes number\n", __func__);
                return -EINVAL;
        }

        /* pipe_profiles and n_pipe_profiles */
        if (params->pipe_profiles == NULL ||
            params->n_pipe_profiles == 0 ||
                params->n_max_pipe_profiles == 0 ||
                params->n_pipe_profiles > params->n_max_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for pipe profiles\n", __func__);
                return -EINVAL;
        }

        for (i = 0; i < params->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *p = params->pipe_profiles + i;
                int status;

                status = pipe_profile_check(p, rate, &params->qsize[0]);
                if (status != 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Pipe profile check failed (%d)\n", __func__, status);
                        return -EINVAL;
                }
        }

        return 0;
}

uint32_t
rte_sched_port_get_memory_footprint(struct rte_sched_port_params *port_params,
        struct rte_sched_subport_params **subport_params)
{
        uint32_t size0 = 0, size1 = 0, i;
        int status;

        status = rte_sched_port_check_params(port_params);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Port scheduler port params check failed (%d)\n",
                        __func__, status);

                return 0;
        }

        for (i = 0; i < port_params->n_subports_per_port; i++) {
                struct rte_sched_subport_params *sp = subport_params[i];

                status = rte_sched_subport_check_params(sp,
                                port_params->n_pipes_per_subport,
                                port_params->rate);
                if (status != 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Port scheduler subport params check failed (%d)\n",
                                __func__, status);

                        return 0;
                }
        }

        size0 = sizeof(struct rte_sched_port);

        for (i = 0; i < port_params->n_subports_per_port; i++) {
                struct rte_sched_subport_params *sp = subport_params[i];

                size1 += rte_sched_subport_get_array_base(sp,
                                        e_RTE_SCHED_SUBPORT_ARRAY_TOTAL);
        }

        return size0 + size1;
}

struct rte_sched_port *
rte_sched_port_config(struct rte_sched_port_params *params)
{
        struct rte_sched_port *port = NULL;
        uint32_t size0, size1;
        uint32_t cycles_per_byte;
        uint32_t i, j;
        int status;

        status = rte_sched_port_check_params(params);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Port scheduler params check failed (%d)\n",
                        __func__, status);
                return NULL;
        }

        size0 = sizeof(struct rte_sched_port);
        size1 = params->n_subports_per_port * sizeof(struct rte_sched_subport *);

        /* Allocate memory to store the data structures */
        port = rte_zmalloc_socket("qos_params", size0 + size1, RTE_CACHE_LINE_SIZE,
                params->socket);
        if (port == NULL) {
                RTE_LOG(ERR, SCHED, "%s: Memory allocation fails\n", __func__);

                return NULL;
        }

        /* User parameters */
        port->n_subports_per_port = params->n_subports_per_port;
        port->n_pipes_per_subport = params->n_pipes_per_subport;
        port->n_pipes_per_subport_log2 =
                        __builtin_ctz(params->n_pipes_per_subport);
        port->socket = params->socket;

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                port->pipe_queue[i] = i;

        for (i = 0, j = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) {
                port->pipe_tc[i] = j;

                if (j < RTE_SCHED_TRAFFIC_CLASS_BE)
                        j++;
        }

        for (i = 0, j = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) {
                port->tc_queue[i] = j;

                if (i >= RTE_SCHED_TRAFFIC_CLASS_BE)
                        j++;
        }
        port->rate = params->rate;
        port->mtu = params->mtu + params->frame_overhead;
        port->frame_overhead = params->frame_overhead;

        /* Timing */
        port->time_cpu_cycles = rte_get_tsc_cycles();
        port->time_cpu_bytes = 0;
        port->time = 0;

        cycles_per_byte = (rte_get_tsc_hz() << RTE_SCHED_TIME_SHIFT)
                / params->rate;
        port->inv_cycles_per_byte = rte_reciprocal_value(cycles_per_byte);
        port->cycles_per_byte = cycles_per_byte;

        /* Grinders */
        port->pkts_out = NULL;
        port->n_pkts_out = 0;
        port->subport_id = 0;

        return port;
}
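
/*
 * Typical bring-up sequence (illustrative sketch; error handling and
 * parameter setup omitted, identifiers port_params/subport_params are
 * placeholders):
 *
 *	struct rte_sched_port *p = rte_sched_port_config(&port_params);
 *
 *	for (i = 0; i < port_params.n_subports_per_port; i++) {
 *		rte_sched_subport_config(p, i, subport_params[i]);
 *		for (j = 0; j < subport_params[i]->n_pipes_per_subport_enabled; j++)
 *			rte_sched_pipe_config(p, i, j, 0);
 *	}
 */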

static inline void
rte_sched_subport_free(struct rte_sched_port *port,
        struct rte_sched_subport *subport)
{
        uint32_t n_subport_pipe_queues;
        uint32_t qindex;

        if (subport == NULL)
                return;

        n_subport_pipe_queues = rte_sched_subport_pipe_queues(subport);

        /* Free enqueued mbufs */
        for (qindex = 0; qindex < n_subport_pipe_queues; qindex++) {
                struct rte_mbuf **mbufs =
                        rte_sched_subport_pipe_qbase(subport, qindex);
                uint16_t qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
                if (qsize != 0) {
                        struct rte_sched_queue *queue = subport->queue + qindex;
                        uint16_t qr = queue->qr & (qsize - 1);
                        uint16_t qw = queue->qw & (qsize - 1);

                        for (; qr != qw; qr = (qr + 1) & (qsize - 1))
                                rte_pktmbuf_free(mbufs[qr]);
                }
        }

        rte_free(subport);
}

void
rte_sched_port_free(struct rte_sched_port *port)
{
        uint32_t i;

        /* Check user parameters */
        if (port == NULL)
                return;

        for (i = 0; i < port->n_subports_per_port; i++)
                rte_sched_subport_free(port, port->subports[i]);

        rte_free(port);
}

static void
rte_sched_port_log_subport_config(struct rte_sched_port *port, uint32_t i)
{
        struct rte_sched_subport *s = port->subports[i];

        RTE_LOG(DEBUG, SCHED, "Low level config for subport %u:\n"
                "       Token bucket: period = %"PRIu64", credits per period = %"PRIu64
                ", size = %"PRIu64"\n"
                "       Traffic classes: period = %"PRIu64"\n"
                "       credits per period = [%"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64
                ", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64
                ", %"PRIu64", %"PRIu64", %"PRIu64"]\n"
                "       Best effort traffic class oversubscription: wm min = %"PRIu64
                ", wm max = %"PRIu64"\n",
                i,

                /* Token bucket */
                s->tb_period,
                s->tb_credits_per_period,
                s->tb_size,

                /* Traffic classes */
                s->tc_period,
                s->tc_credits_per_period[0],
                s->tc_credits_per_period[1],
                s->tc_credits_per_period[2],
                s->tc_credits_per_period[3],
                s->tc_credits_per_period[4],
                s->tc_credits_per_period[5],
                s->tc_credits_per_period[6],
                s->tc_credits_per_period[7],
                s->tc_credits_per_period[8],
                s->tc_credits_per_period[9],
                s->tc_credits_per_period[10],
                s->tc_credits_per_period[11],
                s->tc_credits_per_period[12],

                /* Best effort traffic class oversubscription */
                s->tc_ov_wm_min,
                s->tc_ov_wm_max);
}

static void
rte_sched_free_memory(struct rte_sched_port *port, uint32_t n_subports)
{
        uint32_t i;

        for (i = 0; i < n_subports; i++) {
                struct rte_sched_subport *subport = port->subports[i];

                rte_sched_subport_free(port, subport);
        }

        rte_free(port);
}

int
rte_sched_subport_config(struct rte_sched_port *port,
        uint32_t subport_id,
        struct rte_sched_subport_params *params)
{
        struct rte_sched_subport *s = NULL;
        uint32_t n_subports = subport_id;
        uint32_t n_subport_pipe_queues, i;
        uint32_t size0, size1, bmp_mem_size;
        int status;

        /* Check user parameters */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        status = rte_sched_subport_check_params(params,
                port->n_pipes_per_subport,
                port->rate);
        if (status != 0) {
                RTE_LOG(NOTICE, SCHED,
                        "%s: Subport scheduler params check failed (%d)\n",
                        __func__, status);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        /* Determine the amount of memory to allocate */
        size0 = sizeof(struct rte_sched_subport);
        size1 = rte_sched_subport_get_array_base(params,
                                e_RTE_SCHED_SUBPORT_ARRAY_TOTAL);

        /* Allocate memory to store the data structures */
        s = rte_zmalloc_socket("subport_params", size0 + size1,
                RTE_CACHE_LINE_SIZE, port->socket);
        if (s == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Memory allocation fails\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -ENOMEM;
        }

        n_subports++;

        /* Port */
        port->subports[subport_id] = s;

        /* Token Bucket (TB) */
        if (params->tb_rate == port->rate) {
                s->tb_credits_per_period = 1;
                s->tb_period = 1;
        } else {
                double tb_rate = ((double) params->tb_rate) / ((double) port->rate);
                double d = RTE_SCHED_TB_RATE_CONFIG_ERR;

                rte_approx_64(tb_rate, d, &s->tb_credits_per_period, &s->tb_period);
        }

        s->tb_size = params->tb_size;
        s->tb_time = port->time;
        s->tb_credits = s->tb_size / 2;

        /* Traffic Classes (TCs) */
        s->tc_period = rte_sched_time_ms_to_bytes(params->tc_period, port->rate);
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if (params->qsize[i])
                        s->tc_credits_per_period[i]
                                = rte_sched_time_ms_to_bytes(params->tc_period,
                                        params->tc_rate[i]);
        }
        s->tc_time = port->time + s->tc_period;
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (params->qsize[i])
                        s->tc_credits[i] = s->tc_credits_per_period[i];

        /* compile time checks */
        RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS == 0);
        RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS &
                (RTE_SCHED_PORT_N_GRINDERS - 1));

        /* User parameters */
        s->n_pipes_per_subport_enabled = params->n_pipes_per_subport_enabled;
        memcpy(s->qsize, params->qsize, sizeof(params->qsize));
        s->n_pipe_profiles = params->n_pipe_profiles;
        s->n_max_pipe_profiles = params->n_max_pipe_profiles;

#ifdef RTE_SCHED_RED
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint32_t j;

                for (j = 0; j < RTE_COLORS; j++) {
                        /* if min/max are both zero, then RED is disabled */
                        if ((params->red_params[i][j].min_th |
                             params->red_params[i][j].max_th) == 0) {
                                continue;
                        }

                        if (rte_red_config_init(&s->red_config[i][j],
                                params->red_params[i][j].wq_log2,
                                params->red_params[i][j].min_th,
                                params->red_params[i][j].max_th,
                                params->red_params[i][j].maxp_inv) != 0) {
                                rte_sched_free_memory(port, n_subports);

                                RTE_LOG(NOTICE, SCHED,
                                "%s: RED configuration init fails\n", __func__);
                                return -EINVAL;
                        }
                }
        }
#endif

        /* Scheduling loop detection */
        s->pipe_loop = RTE_SCHED_PIPE_INVALID;
        s->pipe_exhaustion = 0;

        /* Grinders */
        s->busy_grinders = 0;

        /* Queue base calculation */
        rte_sched_subport_config_qsize(s);

        /* Large data structures */
        s->pipe = (struct rte_sched_pipe *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_PIPE));
        s->queue = (struct rte_sched_queue *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE));
        s->queue_extra = (struct rte_sched_queue_extra *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA));
        s->pipe_profiles = (struct rte_sched_pipe_profile *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES));
        s->bmp_array = s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY);
        s->queue_array = (struct rte_mbuf **)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY));

        /* Pipe profile table */
        rte_sched_subport_config_pipe_profile_table(s, params, port->rate);

        /* Bitmap */
        n_subport_pipe_queues = rte_sched_subport_pipe_queues(s);
        bmp_mem_size = rte_bitmap_get_memory_footprint(n_subport_pipe_queues);
        s->bmp = rte_bitmap_init(n_subport_pipe_queues, s->bmp_array,
                                bmp_mem_size);
        if (s->bmp == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Subport bitmap init error\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++)
                s->grinder_base_bmp_pos[i] = RTE_SCHED_PIPE_INVALID;

#ifdef RTE_SCHED_SUBPORT_TC_OV
        /* TC oversubscription */
        s->tc_ov_wm_min = port->mtu;
        s->tc_ov_wm_max = rte_sched_time_ms_to_bytes(params->tc_period,
                                                     s->pipe_tc_be_rate_max);
        s->tc_ov_wm = s->tc_ov_wm_max;
        s->tc_ov_period_id = 0;
        s->tc_ov = 0;
        s->tc_ov_n = 0;
        s->tc_ov_rate = 0;
#endif

        rte_sched_port_log_subport_config(port, subport_id);

        return 0;
}

int
rte_sched_pipe_config(struct rte_sched_port *port,
        uint32_t subport_id,
        uint32_t pipe_id,
        int32_t pipe_profile)
{
        struct rte_sched_subport *s;
        struct rte_sched_pipe *p;
        struct rte_sched_pipe_profile *params;
        uint32_t n_subports = subport_id + 1;
        uint32_t deactivate, profile, i;

        /* Check user parameters */
        profile = (uint32_t) pipe_profile;
        deactivate = (pipe_profile < 0);

        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter subport id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        s = port->subports[subport_id];
        if (pipe_id >= s->n_pipes_per_subport_enabled) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter pipe id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        if (!deactivate && profile >= s->n_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter pipe profile\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        /* Handle the case when pipe already has a valid configuration */
        p = s->pipe + pipe_id;
        if (p->tb_time) {
                params = s->pipe_profiles + p->profile;

                double subport_tc_be_rate =
                        (double) s->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) s->tc_period;
                double pipe_tc_be_rate =
                        (double) params->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) params->tc_period;
                uint32_t tc_be_ov = s->tc_ov;

                /* Unplug pipe from its subport */
                s->tc_ov_n -= params->tc_ov_weight;
                s->tc_ov_rate -= pipe_tc_be_rate;
                s->tc_ov = s->tc_ov_rate > subport_tc_be_rate;

                if (s->tc_ov != tc_be_ov) {
                        RTE_LOG(DEBUG, SCHED,
                                "Subport %u Best-effort TC oversubscription is OFF (%.4lf >= %.4lf)\n",
                                subport_id, subport_tc_be_rate, s->tc_ov_rate);
                }

                /* Reset the pipe */
                memset(p, 0, sizeof(struct rte_sched_pipe));
        }

        if (deactivate)
                return 0;

        /* Apply the new pipe configuration */
        p->profile = profile;
        params = s->pipe_profiles + p->profile;

        /* Token Bucket (TB) */
        p->tb_time = port->time;
        p->tb_credits = params->tb_size / 2;

        /* Traffic Classes (TCs) */
        p->tc_time = port->time + params->tc_period;

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (s->qsize[i])
                        p->tc_credits[i] = params->tc_credits_per_period[i];

        {
                /* Subport best effort tc oversubscription */
                double subport_tc_be_rate =
                        (double) s->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) s->tc_period;
                double pipe_tc_be_rate =
                        (double) params->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) params->tc_period;
                uint32_t tc_be_ov = s->tc_ov;

                s->tc_ov_n += params->tc_ov_weight;
                s->tc_ov_rate += pipe_tc_be_rate;
                s->tc_ov = s->tc_ov_rate > subport_tc_be_rate;

                if (s->tc_ov != tc_be_ov) {
                        RTE_LOG(DEBUG, SCHED,
                                "Subport %u Best effort TC oversubscription is ON (%.4lf < %.4lf)\n",
                                subport_id, subport_tc_be_rate, s->tc_ov_rate);
                }
                p->tc_ov_period_id = s->tc_ov_period_id;
                p->tc_ov_credits = s->tc_ov_wm;
        }

        return 0;
}

int
rte_sched_subport_pipe_profile_add(struct rte_sched_port *port,
        uint32_t subport_id,
        struct rte_sched_pipe_params *params,
        uint32_t *pipe_profile_id)
{
        struct rte_sched_subport *s;
        struct rte_sched_pipe_profile *pp;
        uint32_t i;
        int status;

        /* Port */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        /* Subport id must be within the number of configured subports */
        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);
                return -EINVAL;
        }

        s = port->subports[subport_id];

        /* Number of pipe profiles must not exceed the max limit */
        if (s->n_pipe_profiles >= s->n_max_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Number of pipe profiles exceeds the max limit\n", __func__);
                return -EINVAL;
        }

        /* Pipe params */
        status = pipe_profile_check(params, port->rate, &s->qsize[0]);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Pipe profile check failed (%d)\n", __func__, status);
                return -EINVAL;
        }

        pp = &s->pipe_profiles[s->n_pipe_profiles];
        rte_sched_pipe_profile_convert(s, params, pp, port->rate);

        /* Pipe profile must not already exist */
        for (i = 0; i < s->n_pipe_profiles; i++)
                if (memcmp(s->pipe_profiles + i, pp, sizeof(*pp)) == 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Pipe profile exists\n", __func__);
                        return -EINVAL;
                }

        /* Pipe profile commit */
        *pipe_profile_id = s->n_pipe_profiles;
        s->n_pipe_profiles++;

        if (s->pipe_tc_be_rate_max < params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE])
                s->pipe_tc_be_rate_max = params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE];

        rte_sched_port_log_pipe_profile(s, *pipe_profile_id);

        return 0;
}

static inline uint32_t
rte_sched_port_qindex(struct rte_sched_port *port,
        uint32_t subport,
        uint32_t pipe,
        uint32_t traffic_class,
        uint32_t queue)
{
        return ((subport & (port->n_subports_per_port - 1)) <<
                (port->n_pipes_per_subport_log2 + 4)) |
                ((pipe &
                (port->subports[subport]->n_pipes_per_subport_enabled - 1)) << 4) |
                ((rte_sched_port_pipe_queue(port, traffic_class) + queue) &
                (RTE_SCHED_QUEUES_PER_PIPE - 1));
}
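
/*
 * Worked example (illustrative): with 4096 pipes per subport
 * (n_pipes_per_subport_log2 = 12), subport 2, pipe 5, best-effort traffic
 * class (12), queue 1 maps to queue position 12 + 1 = 13 within the pipe,
 * so qindex = (2 << 16) | (5 << 4) | 13 = 131165.
 */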

void
rte_sched_port_pkt_write(struct rte_sched_port *port,
                         struct rte_mbuf *pkt,
                         uint32_t subport, uint32_t pipe,
                         uint32_t traffic_class,
                         uint32_t queue, enum rte_color color)
{
        uint32_t queue_id =
                rte_sched_port_qindex(port, subport, pipe, traffic_class, queue);

        rte_mbuf_sched_set(pkt, queue_id, traffic_class, (uint8_t)color);
}
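
/*
 * Illustrative usage (values are placeholders): the classification stage
 * stamps the hierarchy path into each mbuf before the burst is handed to
 * rte_sched_port_enqueue():
 *
 *	rte_sched_port_pkt_write(port, pkt, subport_id, pipe_id,
 *		RTE_SCHED_TRAFFIC_CLASS_BE, 0, RTE_COLOR_GREEN);
 */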
1367
1368 void
1369 rte_sched_port_pkt_read_tree_path(struct rte_sched_port *port,
1370                                   const struct rte_mbuf *pkt,
1371                                   uint32_t *subport, uint32_t *pipe,
1372                                   uint32_t *traffic_class, uint32_t *queue)
1373 {
1374         uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
1375
1376         *subport = queue_id >> (port->n_pipes_per_subport_log2 + 4);
1377         *pipe = (queue_id >> 4) &
1378                 (port->subports[*subport]->n_pipes_per_subport_enabled - 1);
1379         *traffic_class = rte_sched_port_pipe_tc(port, queue_id);
1380         *queue = rte_sched_port_tc_queue(port, queue_id);
1381 }
1382
1383 enum rte_color
1384 rte_sched_port_pkt_read_color(const struct rte_mbuf *pkt)
1385 {
1386         return (enum rte_color)rte_mbuf_sched_color_get(pkt);
1387 }
1388
1389 int
1390 rte_sched_subport_read_stats(struct rte_sched_port *port,
1391                              uint32_t subport_id,
1392                              struct rte_sched_subport_stats *stats,
1393                              uint32_t *tc_ov)
1394 {
1395         struct rte_sched_subport *s;
1396
1397         /* Check user parameters */
1398         if (port == NULL) {
1399                 RTE_LOG(ERR, SCHED,
1400                         "%s: Incorrect value for parameter port\n", __func__);
1401                 return -EINVAL;
1402         }
1403
1404         if (subport_id >= port->n_subports_per_port) {
1405                 RTE_LOG(ERR, SCHED,
1406                         "%s: Incorrect value for subport id\n", __func__);
1407                 return -EINVAL;
1408         }
1409
1410         if (stats == NULL) {
1411                 RTE_LOG(ERR, SCHED,
1412                         "%s: Incorrect value for parameter stats\n", __func__);
1413                 return -EINVAL;
1414         }
1415
1416         if (tc_ov == NULL) {
1417                 RTE_LOG(ERR, SCHED,
1418                         "%s: Incorrect value for tc_ov\n", __func__);
1419                 return -EINVAL;
1420         }
1421
1422         s = port->subports[subport_id];
1423
1424         /* Copy subport stats and clear */
1425         memcpy(stats, &s->stats, sizeof(struct rte_sched_subport_stats));
1426         memset(&s->stats, 0, sizeof(struct rte_sched_subport_stats));
1427
1428         /* Subport TC oversubscription status */
1429         *tc_ov = s->tc_ov;
1430
1431         return 0;
1432 }
1433
1434 int
1435 rte_sched_queue_read_stats(struct rte_sched_port *port,
1436         uint32_t queue_id,
1437         struct rte_sched_queue_stats *stats,
1438         uint16_t *qlen)
1439 {
1440         struct rte_sched_subport *s;
1441         struct rte_sched_queue *q;
1442         struct rte_sched_queue_extra *qe;
1443         uint32_t subport_id, subport_qmask, subport_qindex;
1444
1445         /* Check user parameters */
1446         if (port == NULL) {
1447                 RTE_LOG(ERR, SCHED,
1448                         "%s: Incorrect value for parameter port\n", __func__);
1449                 return -EINVAL;
1450         }
1451
1452         if (queue_id >= rte_sched_port_queues_per_port(port)) {
1453                 RTE_LOG(ERR, SCHED,
1454                         "%s: Incorrect value for queue id\n", __func__);
1455                 return -EINVAL;
1456         }
1457
1458         if (stats == NULL) {
1459                 RTE_LOG(ERR, SCHED,
1460                         "%s: Incorrect value for parameter stats\n", __func__);
1461                 return -EINVAL;
1462         }
1463
1464         if (qlen == NULL) {
1465                 RTE_LOG(ERR, SCHED,
1466                         "%s: Incorrect value for parameter qlen\n", __func__);
1467                 return -EINVAL;
1468         }
1469         subport_qmask = port->n_pipes_per_subport_log2 + 4;
1470         subport_id = (queue_id >> subport_qmask) & (port->n_subports_per_port - 1);
1471
1472         s = port->subports[subport_id];
1473         subport_qindex = ((1 << subport_qmask) - 1) & queue_id;
1474         q = s->queue + subport_qindex;
1475         qe = s->queue_extra + subport_qindex;
1476
1477         /* Copy queue stats and clear */
1478         memcpy(stats, &qe->stats, sizeof(struct rte_sched_queue_stats));
1479         memset(&qe->stats, 0, sizeof(struct rte_sched_queue_stats));
1480
1481         /* Queue length: qw - qr stays correct across uint16_t wrap-around */
1482         *qlen = q->qw - q->qr;
1483
1484         return 0;
1485 }
1486
1487 #ifdef RTE_SCHED_DEBUG
1488
1489 static inline int
1490 rte_sched_port_queue_is_empty(struct rte_sched_subport *subport,
1491         uint32_t qindex)
1492 {
1493         struct rte_sched_queue *queue = subport->queue + qindex;
1494
1495         return queue->qr == queue->qw;
1496 }
1497
1498 #endif /* RTE_SCHED_DEBUG */
1499
1500 #ifdef RTE_SCHED_COLLECT_STATS
1501
1502 static inline void
1503 rte_sched_port_update_subport_stats(struct rte_sched_port *port,
1504         struct rte_sched_subport *subport,
1505         uint32_t qindex,
1506         struct rte_mbuf *pkt)
1507 {
1508         uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex);
1509         uint32_t pkt_len = pkt->pkt_len;
1510
1511         subport->stats.n_pkts_tc[tc_index] += 1;
1512         subport->stats.n_bytes_tc[tc_index] += pkt_len;
1513 }
1514
1515 #ifdef RTE_SCHED_RED
1516 static inline void
1517 rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port,
1518         struct rte_sched_subport *subport,
1519         uint32_t qindex,
1520         struct rte_mbuf *pkt,
1521         uint32_t red)
1522 #else
1523 static inline void
1524 rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port,
1525         struct rte_sched_subport *subport,
1526         uint32_t qindex,
1527         struct rte_mbuf *pkt,
1528         __rte_unused uint32_t red)
1529 #endif
1530 {
1531         uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex);
1532         uint32_t pkt_len = pkt->pkt_len;
1533
1534         subport->stats.n_pkts_tc_dropped[tc_index] += 1;
1535         subport->stats.n_bytes_tc_dropped[tc_index] += pkt_len;
1536 #ifdef RTE_SCHED_RED
1537         subport->stats.n_pkts_red_dropped[tc_index] += red;
1538 #endif
1539 }
1540
1541 static inline void
1542 rte_sched_port_update_queue_stats(struct rte_sched_subport *subport,
1543         uint32_t qindex,
1544         struct rte_mbuf *pkt)
1545 {
1546         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1547         uint32_t pkt_len = pkt->pkt_len;
1548
1549         qe->stats.n_pkts += 1;
1550         qe->stats.n_bytes += pkt_len;
1551 }
1552
1553 #ifdef RTE_SCHED_RED
1554 static inline void
1555 rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport,
1556         uint32_t qindex,
1557         struct rte_mbuf *pkt,
1558         uint32_t red)
1559 #else
1560 static inline void
1561 rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport,
1562         uint32_t qindex,
1563         struct rte_mbuf *pkt,
1564         __rte_unused uint32_t red)
1565 #endif
1566 {
1567         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1568         uint32_t pkt_len = pkt->pkt_len;
1569
1570         qe->stats.n_pkts_dropped += 1;
1571         qe->stats.n_bytes_dropped += pkt_len;
1572 #ifdef RTE_SCHED_RED
1573         qe->stats.n_pkts_red_dropped += red;
1574 #endif
1575 }
1576
1577 #endif /* RTE_SCHED_COLLECT_STATS */
1578
1579 #ifdef RTE_SCHED_RED
1580
1581 static inline int
1582 rte_sched_port_red_drop(struct rte_sched_port *port,
1583         struct rte_sched_subport *subport,
1584         struct rte_mbuf *pkt,
1585         uint32_t qindex,
1586         uint16_t qlen)
1587 {
1588         struct rte_sched_queue_extra *qe;
1589         struct rte_red_config *red_cfg;
1590         struct rte_red *red;
1591         uint32_t tc_index;
1592         enum rte_color color;
1593
1594         tc_index = rte_sched_port_pipe_tc(port, qindex);
1595         color = rte_sched_port_pkt_read_color(pkt);
1596         red_cfg = &subport->red_config[tc_index][color];
1597
1598         if ((red_cfg->min_th | red_cfg->max_th) == 0)
1599                 return 0;
1600
1601         qe = subport->queue_extra + qindex;
1602         red = &qe->red;
1603
1604         return rte_red_enqueue(red_cfg, red, qlen, port->time);
1605 }
1606
1607 static inline void
1608 rte_sched_port_set_queue_empty_timestamp(struct rte_sched_port *port,
1609         struct rte_sched_subport *subport, uint32_t qindex)
1610 {
1611         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1612         struct rte_red *red = &qe->red;
1613
1614         rte_red_mark_queue_empty(red, port->time);
1615 }
1616
1617 #else
1618
1619 static inline int rte_sched_port_red_drop(struct rte_sched_port *port __rte_unused,
1620         struct rte_sched_subport *subport __rte_unused,
1621         struct rte_mbuf *pkt __rte_unused,
1622         uint32_t qindex __rte_unused,
1623         uint16_t qlen __rte_unused)
1624 {
1625         return 0;
1626 }
1627
1628 #define rte_sched_port_set_queue_empty_timestamp(port, subport, qindex)
1629
1630 #endif /* RTE_SCHED_RED */
1631
1632 #ifdef RTE_SCHED_DEBUG
1633
1634 static inline void
1635 debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
1636                        uint64_t bmp_slab)
1637 {
1638         uint64_t mask;
1639         uint32_t i, panic;
1640
1641         if (bmp_slab == 0)
1642                 rte_panic("Empty slab at position %u\n", bmp_pos);
1643
1644         panic = 0;
1645         for (i = 0, mask = 1; i < 64; i++, mask <<= 1) {
1646                 if (mask & bmp_slab) {
1647                         if (rte_sched_port_queue_is_empty(subport, bmp_pos + i)) {
1648                                 printf("Queue %u (slab offset %u) is empty\n", bmp_pos + i, i);
1649                                 panic = 1;
1650                         }
1651                 }
1652         }
1653
1654         if (panic)
1655                 rte_panic("Empty queues in slab 0x%" PRIx64 "starting at position %u\n",
1656                         bmp_slab, bmp_pos);
1657 }
1658
1659 #endif /* RTE_SCHED_DEBUG */
1660
1661 static inline struct rte_sched_subport *
1662 rte_sched_port_subport(struct rte_sched_port *port,
1663         struct rte_mbuf *pkt)
1664 {
1665         uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
1666         uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
1667
1668         return port->subports[subport_id];
1669 }
1670
1671 static inline uint32_t
1672 rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
1673         struct rte_mbuf *pkt, uint32_t subport_qmask)
1674 {
1675         struct rte_sched_queue *q;
1676 #ifdef RTE_SCHED_COLLECT_STATS
1677         struct rte_sched_queue_extra *qe;
1678 #endif
1679         uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
1680         uint32_t subport_queue_id = subport_qmask & qindex;
1681
1682         q = subport->queue + subport_queue_id;
1683         rte_prefetch0(q);
1684 #ifdef RTE_SCHED_COLLECT_STATS
1685         qe = subport->queue_extra + subport_queue_id;
1686         rte_prefetch0(qe);
1687 #endif
1688
1689         return subport_queue_id;
1690 }
1691
1692 static inline void
1693 rte_sched_port_enqueue_qwa_prefetch0(struct rte_sched_port *port,
1694         struct rte_sched_subport *subport,
1695         uint32_t qindex,
1696         struct rte_mbuf **qbase)
1697 {
1698         struct rte_sched_queue *q;
1699         struct rte_mbuf **q_qw;
1700         uint16_t qsize;
1701
1702         q = subport->queue + qindex;
1703         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
1704         q_qw = qbase + (q->qw & (qsize - 1));
1705
1706         rte_prefetch0(q_qw);
1707         rte_bitmap_prefetch0(subport->bmp, qindex);
1708 }
1709
1710 static inline int
1711 rte_sched_port_enqueue_qwa(struct rte_sched_port *port,
1712         struct rte_sched_subport *subport,
1713         uint32_t qindex,
1714         struct rte_mbuf **qbase,
1715         struct rte_mbuf *pkt)
1716 {
1717         struct rte_sched_queue *q;
1718         uint16_t qsize;
1719         uint16_t qlen;
1720
1721         q = subport->queue + qindex;
1722         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
1723         qlen = q->qw - q->qr;
1724
1725         /* Drop the packet (and update drop stats) when RED drops it or the queue is full */
1726         if (unlikely(rte_sched_port_red_drop(port, subport, pkt, qindex, qlen) ||
1727                      (qlen >= qsize))) {
1728                 rte_pktmbuf_free(pkt);
1729 #ifdef RTE_SCHED_COLLECT_STATS
1730                 rte_sched_port_update_subport_stats_on_drop(port, subport,
1731                         qindex, pkt, qlen < qsize);
1732                 rte_sched_port_update_queue_stats_on_drop(subport, qindex, pkt,
1733                         qlen < qsize);
1734 #endif
1735                 return 0;
1736         }
1737
1738         /* Enqueue packet */
1739         qbase[q->qw & (qsize - 1)] = pkt;
1740         q->qw++;
1741
1742         /* Activate queue in the subport bitmap */
1743         rte_bitmap_set(subport->bmp, qindex);
1744
1745         /* Statistics */
1746 #ifdef RTE_SCHED_COLLECT_STATS
1747         rte_sched_port_update_subport_stats(port, subport, qindex, pkt);
1748         rte_sched_port_update_queue_stats(subport, qindex, pkt);
1749 #endif
1750
1751         return 1;
1752 }
1753
1754
1755 /*
1756  * The enqueue function implements a 4-level pipeline with each stage
1757  * processing two different packets. The purpose of using a pipeline
1758  * is to hide the latency of prefetching the data structures. The
1759  * naming convention is presented in the diagram below:
1760  *
1761  *   p00  _______   p10  _______   p20  _______   p30  _______
1762  * ----->|       |----->|       |----->|       |----->|       |----->
1763  *       |   0   |      |   1   |      |   2   |      |   3   |
1764  * ----->|_______|----->|_______|----->|_______|----->|_______|----->
1765  *   p01            p11            p21            p31
1766  *
1767  */
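/*
 * Informally: pXY is the packet in stage X, lane Y. Stage 0 issues the
 * mbuf prefetch, stage 1 prefetches the subport and the queue structure
 * holding the queue pointers, stage 2 prefetches the queue write
 * location, and stage 3 writes the packet and activates its queue, so
 * each iteration of the main loop below pushes two packets through all
 * four stages.
 */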
1768 int
1769 rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
1770                        uint32_t n_pkts)
1771 {
1772         struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
1773                 *pkt30, *pkt31, *pkt_last;
1774         struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
1775                 **q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
1776         struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
1777                 *subport20, *subport21, *subport30, *subport31, *subport_last;
1778         uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
1779         uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
1780         uint32_t subport_qmask;
1781         uint32_t result, i;
1782
1783         result = 0;
1784         subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
1785
1786         /*
1787          * Fewer than 6 input packets available, which is not enough to
1788          * feed the pipeline
1789          */
1790         if (unlikely(n_pkts < 6)) {
1791                 struct rte_sched_subport *subports[5];
1792                 struct rte_mbuf **q_base[5];
1793                 uint32_t q[5];
1794
1795                 /* Prefetch the mbuf structure of each packet */
1796                 for (i = 0; i < n_pkts; i++)
1797                         rte_prefetch0(pkts[i]);
1798
1799                 /* Prefetch the subport structure for each packet */
1800                 for (i = 0; i < n_pkts; i++)
1801                         subports[i] = rte_sched_port_subport(port, pkts[i]);
1802
1803                 /* Prefetch the queue structure for each queue */
1804                 for (i = 0; i < n_pkts; i++)
1805                         q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
1806                                         pkts[i], subport_qmask);
1807
1808                 /* Prefetch the write pointer location of each queue */
1809                 for (i = 0; i < n_pkts; i++) {
1810                         q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
1811                         rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
1812                                 q[i], q_base[i]);
1813                 }
1814
1815                 /* Write each packet to its queue */
1816                 for (i = 0; i < n_pkts; i++)
1817                         result += rte_sched_port_enqueue_qwa(port, subports[i],
1818                                                 q[i], q_base[i], pkts[i]);
1819
1820                 return result;
1821         }
1822
1823         /* Feed the first 3 stages of the pipeline (6 packets needed) */
1824         pkt20 = pkts[0];
1825         pkt21 = pkts[1];
1826         rte_prefetch0(pkt20);
1827         rte_prefetch0(pkt21);
1828
1829         pkt10 = pkts[2];
1830         pkt11 = pkts[3];
1831         rte_prefetch0(pkt10);
1832         rte_prefetch0(pkt11);
1833
1834         subport20 = rte_sched_port_subport(port, pkt20);
1835         subport21 = rte_sched_port_subport(port, pkt21);
1836         q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
1837                         pkt20, subport_qmask);
1838         q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
1839                         pkt21, subport_qmask);
1840
1841         pkt00 = pkts[4];
1842         pkt01 = pkts[5];
1843         rte_prefetch0(pkt00);
1844         rte_prefetch0(pkt01);
1845
1846         subport10 = rte_sched_port_subport(port, pkt10);
1847         subport11 = rte_sched_port_subport(port, pkt11);
1848         q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
1849                         pkt10, subport_qmask);
1850         q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
1851                         pkt11, subport_qmask);
1852
1853         q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
1854         q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
1855         rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
1856         rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
1857
1858         /* Run the pipeline */
1859         for (i = 6; i < (n_pkts & (~1)); i += 2) {
1860                 /* Propagate stage inputs */
1861                 pkt30 = pkt20;
1862                 pkt31 = pkt21;
1863                 pkt20 = pkt10;
1864                 pkt21 = pkt11;
1865                 pkt10 = pkt00;
1866                 pkt11 = pkt01;
1867                 q30 = q20;
1868                 q31 = q21;
1869                 q20 = q10;
1870                 q21 = q11;
1871                 subport30 = subport20;
1872                 subport31 = subport21;
1873                 subport20 = subport10;
1874                 subport21 = subport11;
1875                 q30_base = q20_base;
1876                 q31_base = q21_base;
1877
1878                 /* Stage 0: Get packets in */
1879                 pkt00 = pkts[i];
1880                 pkt01 = pkts[i + 1];
1881                 rte_prefetch0(pkt00);
1882                 rte_prefetch0(pkt01);
1883
1884                 /* Stage 1: Prefetch subport and queue structure storing queue pointers */
1885                 subport10 = rte_sched_port_subport(port, pkt10);
1886                 subport11 = rte_sched_port_subport(port, pkt11);
1887                 q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
1888                                 pkt10, subport_qmask);
1889                 q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
1890                                 pkt11, subport_qmask);
1891
1892                 /* Stage 2: Prefetch queue write location */
1893                 q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
1894                 q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
1895                 rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
1896                 rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
1897
1898                 /* Stage 3: Write packet to queue and activate queue */
1899                 r30 = rte_sched_port_enqueue_qwa(port, subport30,
1900                                 q30, q30_base, pkt30);
1901                 r31 = rte_sched_port_enqueue_qwa(port, subport31,
1902                                 q31, q31_base, pkt31);
1903                 result += r30 + r31;
1904         }
1905
1906         /*
1907          * Drain the pipeline (exactly 6 packets).
1908          * Handle the last packet in the case
1909          * of an odd number of input packets.
1910          */
1911         pkt_last = pkts[n_pkts - 1];
1912         rte_prefetch0(pkt_last);
1913
1914         subport00 = rte_sched_port_subport(port, pkt00);
1915         subport01 = rte_sched_port_subport(port, pkt01);
1916         q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
1917                         pkt00, subport_qmask);
1918         q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
1919                         pkt01, subport_qmask);
1920
1921         q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
1922         q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
1923         rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
1924         rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
1925
1926         r20 = rte_sched_port_enqueue_qwa(port, subport20,
1927                         q20, q20_base, pkt20);
1928         r21 = rte_sched_port_enqueue_qwa(port, subport21,
1929                         q21, q21_base, pkt21);
1930         result += r20 + r21;
1931
1932         subport_last = rte_sched_port_subport(port, pkt_last);
1933         q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
1934                                 pkt_last, subport_qmask);
1935
1936         q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
1937         q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
1938         rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
1939         rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
1940
1941         r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
1942                         q10_base, pkt10);
1943         r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
1944                         q11_base, pkt11);
1945         result += r10 + r11;
1946
1947         q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
1948         rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
1949                 q_last, q_last_base);
1950
1951         r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
1952                         q00_base, pkt00);
1953         r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
1954                         q01_base, pkt01);
1955         result += r00 + r01;
1956
1957         if (n_pkts & 1) {
1958                 r_last = rte_sched_port_enqueue_qwa(port, subport_last,
1959                                         q_last, q_last_base, pkt_last);
1960                 result += r_last;
1961         }
1962
1963         return result;
1964 }
1965
1966 #ifndef RTE_SCHED_SUBPORT_TC_OV
1967
1968 static inline void
1969 grinder_credits_update(struct rte_sched_port *port,
1970         struct rte_sched_subport *subport, uint32_t pos)
1971 {
1972         struct rte_sched_grinder *grinder = subport->grinder + pos;
1973         struct rte_sched_pipe *pipe = grinder->pipe;
1974         struct rte_sched_pipe_profile *params = grinder->pipe_params;
1975         uint64_t n_periods;
1976         uint32_t i;
1977
1978         /* Subport TB */
1979         n_periods = (port->time - subport->tb_time) / subport->tb_period;
1980         subport->tb_credits += n_periods * subport->tb_credits_per_period;
1981         subport->tb_credits = RTE_MIN(subport->tb_credits, subport->tb_size);
1982         subport->tb_time += n_periods * subport->tb_period;
1983
1984         /* Pipe TB */
1985         n_periods = (port->time - pipe->tb_time) / params->tb_period;
1986         pipe->tb_credits += n_periods * params->tb_credits_per_period;
1987         pipe->tb_credits = RTE_MIN(pipe->tb_credits, params->tb_size);
1988         pipe->tb_time += n_periods * params->tb_period;
1989
1990         /* Subport TCs */
1991         if (unlikely(port->time >= subport->tc_time)) {
1992                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
1993                         subport->tc_credits[i] = subport->tc_credits_per_period[i];
1994
1995                 subport->tc_time = port->time + subport->tc_period;
1996         }
1997
1998         /* Pipe TCs */
1999         if (unlikely(port->time >= pipe->tc_time)) {
2000                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2001                         pipe->tc_credits[i] = params->tc_credits_per_period[i];
2002
2003                 pipe->tc_time = port->time + params->tc_period;
2004         }
2005 }
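
/*
 * Worked token-bucket example (made-up numbers): with tb_period = 100
 * byte-times, tb_credits_per_period = 50 and port->time - tb_time = 350,
 * the bucket gains n_periods = 3 full periods (150 credits, saturated at
 * tb_size) and tb_time advances by 300, carrying the remaining 50
 * byte-times over to the next update.
 */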
2006
2007 #else
2008
2009 static inline uint64_t
2010 grinder_tc_ov_credits_update(struct rte_sched_port *port,
2011         struct rte_sched_subport *subport)
2012 {
2013         uint64_t tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
2014         uint64_t tc_consumption = 0, tc_ov_consumption_max;
2015         uint64_t tc_ov_wm = subport->tc_ov_wm;
2016         uint32_t i;
2017
2018         if (subport->tc_ov == 0)
2019                 return subport->tc_ov_wm_max;
2020
2021         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASS_BE; i++) {
2022                 tc_ov_consumption[i] =
2023                         subport->tc_credits_per_period[i] - subport->tc_credits[i];
2024                 tc_consumption += tc_ov_consumption[i];
2025         }
2026
2027         tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASS_BE] =
2028                 subport->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE] -
2029                 subport->tc_credits[RTE_SCHED_TRAFFIC_CLASS_BE];
2030
2031         tc_ov_consumption_max =
2032                 subport->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE] -
2033                         tc_consumption;
2034
2035         if (tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASS_BE] >
2036                 (tc_ov_consumption_max - port->mtu)) {
2037                 tc_ov_wm  -= tc_ov_wm >> 7;
2038                 if (tc_ov_wm < subport->tc_ov_wm_min)
2039                         tc_ov_wm = subport->tc_ov_wm_min;
2040
2041                 return tc_ov_wm;
2042         }
2043
2044         tc_ov_wm += (tc_ov_wm >> 7) + 1;
2045         if (tc_ov_wm > subport->tc_ov_wm_max)
2046                 tc_ov_wm = subport->tc_ov_wm_max;
2047
2048         return tc_ov_wm;
2049 }
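
/*
 * Sketch of the adaptation rule implemented above: when the BE class
 * consumed more than the bandwidth left over by the higher classes
 * (minus one MTU of slack), the watermark decays by 1/128 of its value,
 * floored at tc_ov_wm_min; otherwise it grows by 1/128 + 1, capped at
 * tc_ov_wm_max. E.g. tc_ov_wm = 12800 shrinks to 12700 under congestion
 * and grows to 12901 otherwise.
 */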
2050
2051 static inline void
2052 grinder_credits_update(struct rte_sched_port *port,
2053         struct rte_sched_subport *subport, uint32_t pos)
2054 {
2055         struct rte_sched_grinder *grinder = subport->grinder + pos;
2056         struct rte_sched_pipe *pipe = grinder->pipe;
2057         struct rte_sched_pipe_profile *params = grinder->pipe_params;
2058         uint64_t n_periods;
2059         uint32_t i;
2060
2061         /* Subport TB */
2062         n_periods = (port->time - subport->tb_time) / subport->tb_period;
2063         subport->tb_credits += n_periods * subport->tb_credits_per_period;
2064         subport->tb_credits = RTE_MIN(subport->tb_credits, subport->tb_size);
2065         subport->tb_time += n_periods * subport->tb_period;
2066
2067         /* Pipe TB */
2068         n_periods = (port->time - pipe->tb_time) / params->tb_period;
2069         pipe->tb_credits += n_periods * params->tb_credits_per_period;
2070         pipe->tb_credits = RTE_MIN(pipe->tb_credits, params->tb_size);
2071         pipe->tb_time += n_periods * params->tb_period;
2072
2073         /* Subport TCs */
2074         if (unlikely(port->time >= subport->tc_time)) {
2075                 subport->tc_ov_wm = grinder_tc_ov_credits_update(port, subport);
2076
2077                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2078                         subport->tc_credits[i] = subport->tc_credits_per_period[i];
2079
2080                 subport->tc_time = port->time + subport->tc_period;
2081                 subport->tc_ov_period_id++;
2082         }
2083
2084         /* Pipe TCs */
2085         if (unlikely(port->time >= pipe->tc_time)) {
2086                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2087                         pipe->tc_credits[i] = params->tc_credits_per_period[i];
2088                 pipe->tc_time = port->time + params->tc_period;
2089         }
2090
2091         /* Pipe TCs - Oversubscription */
2092         if (unlikely(pipe->tc_ov_period_id != subport->tc_ov_period_id)) {
2093                 pipe->tc_ov_credits = subport->tc_ov_wm * params->tc_ov_weight;
2094
2095                 pipe->tc_ov_period_id = subport->tc_ov_period_id;
2096         }
2097 }
2098
2099 #endif /* RTE_SCHED_SUBPORT_TC_OV */
2100
2101
2102 #ifndef RTE_SCHED_SUBPORT_TC_OV
2103
2104 static inline int
2105 grinder_credits_check(struct rte_sched_port *port,
2106         struct rte_sched_subport *subport, uint32_t pos)
2107 {
2108         struct rte_sched_grinder *grinder = subport->grinder + pos;
2109         struct rte_sched_pipe *pipe = grinder->pipe;
2110         struct rte_mbuf *pkt = grinder->pkt;
2111         uint32_t tc_index = grinder->tc_index;
2112         uint64_t pkt_len = pkt->pkt_len + port->frame_overhead;
2113         uint64_t subport_tb_credits = subport->tb_credits;
2114         uint64_t subport_tc_credits = subport->tc_credits[tc_index];
2115         uint64_t pipe_tb_credits = pipe->tb_credits;
2116         uint64_t pipe_tc_credits = pipe->tc_credits[tc_index];
2117         int enough_credits;
2118
2119         /* Check subport and pipe credits */
2120         enough_credits = (pkt_len <= subport_tb_credits) &&
2121                 (pkt_len <= subport_tc_credits) &&
2122                 (pkt_len <= pipe_tb_credits) &&
2123                 (pkt_len <= pipe_tc_credits);
2124
2125         if (!enough_credits)
2126                 return 0;
2127
2128         /* Update subport and pipe credits */
2129         subport->tb_credits -= pkt_len;
2130         subport->tc_credits[tc_index] -= pkt_len;
2131         pipe->tb_credits -= pkt_len;
2132         pipe->tc_credits[tc_index] -= pkt_len;
2133
2134         return 1;
2135 }
2136
2137 #else
2138
2139 static inline int
2140 grinder_credits_check(struct rte_sched_port *port,
2141         struct rte_sched_subport *subport, uint32_t pos)
2142 {
2143         struct rte_sched_grinder *grinder = subport->grinder + pos;
2144         struct rte_sched_pipe *pipe = grinder->pipe;
2145         struct rte_mbuf *pkt = grinder->pkt;
2146         uint32_t tc_index = grinder->tc_index;
2147         uint64_t pkt_len = pkt->pkt_len + port->frame_overhead;
2148         uint64_t subport_tb_credits = subport->tb_credits;
2149         uint64_t subport_tc_credits = subport->tc_credits[tc_index];
2150         uint64_t pipe_tb_credits = pipe->tb_credits;
2151         uint64_t pipe_tc_credits = pipe->tc_credits[tc_index];
2152         uint64_t pipe_tc_ov_mask1[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
2153         uint64_t pipe_tc_ov_mask2[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE] = {0};
2154         uint64_t pipe_tc_ov_credits;
2155         uint32_t i;
2156         int enough_credits;
2157
2158         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2159                 pipe_tc_ov_mask1[i] = ~0LLU;
2160
2161         pipe_tc_ov_mask1[RTE_SCHED_TRAFFIC_CLASS_BE] = pipe->tc_ov_credits;
2162         pipe_tc_ov_mask2[RTE_SCHED_TRAFFIC_CLASS_BE] = ~0LLU;
2163         pipe_tc_ov_credits = pipe_tc_ov_mask1[tc_index];
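
        /*
         * Branch-free oversubscription handling: for tc_index != BE,
         * mask1 yields ~0 (the tc_ov check below always passes) and
         * mask2 yields 0 (nothing is deducted from tc_ov_credits); for
         * the BE class, mask1 selects the real tc_ov_credits and mask2
         * enables the deduction.
         */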
2164
2165         /* Check pipe and subport credits */
2166         enough_credits = (pkt_len <= subport_tb_credits) &&
2167                 (pkt_len <= subport_tc_credits) &&
2168                 (pkt_len <= pipe_tb_credits) &&
2169                 (pkt_len <= pipe_tc_credits) &&
2170                 (pkt_len <= pipe_tc_ov_credits);
2171
2172         if (!enough_credits)
2173                 return 0;
2174
2175         /* Update pipe and subport credits */
2176         subport->tb_credits -= pkt_len;
2177         subport->tc_credits[tc_index] -= pkt_len;
2178         pipe->tb_credits -= pkt_len;
2179         pipe->tc_credits[tc_index] -= pkt_len;
2180         pipe->tc_ov_credits -= pipe_tc_ov_mask2[tc_index] & pkt_len;
2181
2182         return 1;
2183 }
2184
2185 #endif /* RTE_SCHED_SUBPORT_TC_OV */
2186
2187
2188 static inline int
2189 grinder_schedule(struct rte_sched_port *port,
2190         struct rte_sched_subport *subport, uint32_t pos)
2191 {
2192         struct rte_sched_grinder *grinder = subport->grinder + pos;
2193         struct rte_sched_queue *queue = grinder->queue[grinder->qpos];
2194         struct rte_mbuf *pkt = grinder->pkt;
2195         uint32_t pkt_len = pkt->pkt_len + port->frame_overhead;
2196         uint32_t be_tc_active;
2197
2198         if (!grinder_credits_check(port, subport, pos))
2199                 return 0;
2200
2201         /* Advance port time */
2202         port->time += pkt_len;
2203
2204         /* Send packet */
2205         port->pkts_out[port->n_pkts_out++] = pkt;
2206         queue->qr++;
2207
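        /*
         * Branch-free WRR accounting: be_tc_active is all-ones only for
         * the best-effort TC, so the token charge below is a no-op for
         * the higher-priority traffic classes.
         */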
2208         be_tc_active = (grinder->tc_index == RTE_SCHED_TRAFFIC_CLASS_BE) ? ~0x0 : 0x0;
2209         grinder->wrr_tokens[grinder->qpos] +=
2210                 (pkt_len * grinder->wrr_cost[grinder->qpos]) & be_tc_active;
2211
2212         if (queue->qr == queue->qw) {
2213                 uint32_t qindex = grinder->qindex[grinder->qpos];
2214
2215                 rte_bitmap_clear(subport->bmp, qindex);
2216                 grinder->qmask &= ~(1 << grinder->qpos);
2217                 if (be_tc_active)
2218                         grinder->wrr_mask[grinder->qpos] = 0;
2219                 rte_sched_port_set_queue_empty_timestamp(port, subport, qindex);
2220         }
2221
2222         /* Reset pipe loop detection */
2223         subport->pipe_loop = RTE_SCHED_PIPE_INVALID;
2224         grinder->productive = 1;
2225
2226         return 1;
2227 }
2228
2229 #ifdef SCHED_VECTOR_SSE4
2230
2231 static inline int
2232 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2233 {
2234         __m128i index = _mm_set1_epi32(base_pipe);
2235         __m128i pipes = _mm_load_si128((__m128i *)subport->grinder_base_bmp_pos);
2236         __m128i res = _mm_cmpeq_epi32(pipes, index);
2237
2238         pipes = _mm_load_si128((__m128i *)(subport->grinder_base_bmp_pos + 4));
2239         pipes = _mm_cmpeq_epi32(pipes, index);
2240         res = _mm_or_si128(res, pipes);
2241
2242         if (_mm_testz_si128(res, res))
2243                 return 0;
2244
2245         return 1;
2246 }
2247
2248 #elif defined(SCHED_VECTOR_NEON)
2249
2250 static inline int
2251 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2252 {
2253         uint32x4_t index, pipes;
2254         uint32_t *pos = (uint32_t *)subport->grinder_base_bmp_pos;
2255
2256         index = vmovq_n_u32(base_pipe);
2257         pipes = vld1q_u32(pos);
2258         if (!vminvq_u32(veorq_u32(pipes, index)))
2259                 return 1;
2260
2261         pipes = vld1q_u32(pos + 4);
2262         if (!vminvq_u32(veorq_u32(pipes, index)))
2263                 return 1;
2264
2265         return 0;
2266 }
2267
2268 #else
2269
2270 static inline int
2271 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2272 {
2273         uint32_t i;
2274
2275         for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++) {
2276                 if (subport->grinder_base_bmp_pos[i] == base_pipe)
2277                         return 1;
2278         }
2279
2280         return 0;
2281 }
2282
2283 #endif /* SCHED_VECTOR_SSE4, SCHED_VECTOR_NEON */
2284
2285 static inline void
2286 grinder_pcache_populate(struct rte_sched_subport *subport,
2287         uint32_t pos, uint32_t bmp_pos, uint64_t bmp_slab)
2288 {
2289         struct rte_sched_grinder *grinder = subport->grinder + pos;
2290         uint16_t w[4];
2291
2292         grinder->pcache_w = 0;
2293         grinder->pcache_r = 0;
2294
2295         w[0] = (uint16_t) bmp_slab;
2296         w[1] = (uint16_t) (bmp_slab >> 16);
2297         w[2] = (uint16_t) (bmp_slab >> 32);
2298         w[3] = (uint16_t) (bmp_slab >> 48);
2299
2300         grinder->pcache_qmask[grinder->pcache_w] = w[0];
2301         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos;
2302         grinder->pcache_w += (w[0] != 0);
2303
2304         grinder->pcache_qmask[grinder->pcache_w] = w[1];
2305         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 16;
2306         grinder->pcache_w += (w[1] != 0);
2307
2308         grinder->pcache_qmask[grinder->pcache_w] = w[2];
2309         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 32;
2310         grinder->pcache_w += (w[2] != 0);
2311
2312         grinder->pcache_qmask[grinder->pcache_w] = w[3];
2313         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 48;
2314         grinder->pcache_w += (w[3] != 0);
2315 }
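
/*
 * Example (made-up slab): with 16 queues per pipe, each 16-bit word of
 * bmp_slab covers one pipe. bmp_slab = 0x0001000000020000 populates two
 * cache entries, qmask 0x0002 at qindex bmp_pos + 16 and qmask 0x0001 at
 * qindex bmp_pos + 48; the two all-zero words are skipped because
 * pcache_w only advances on a non-zero mask.
 */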
2316
2317 static inline void
2318 grinder_tccache_populate(struct rte_sched_subport *subport,
2319         uint32_t pos, uint32_t qindex, uint16_t qmask)
2320 {
2321         struct rte_sched_grinder *grinder = subport->grinder + pos;
2322         uint8_t b, i;
2323
2324         grinder->tccache_w = 0;
2325         grinder->tccache_r = 0;
2326
2327         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASS_BE; i++) {
2328                 b = (uint8_t) ((qmask >> i) & 0x1);
2329                 grinder->tccache_qmask[grinder->tccache_w] = b;
2330                 grinder->tccache_qindex[grinder->tccache_w] = qindex + i;
2331                 grinder->tccache_w += (b != 0);
2332         }
2333
2334         b = (uint8_t) (qmask >> (RTE_SCHED_TRAFFIC_CLASS_BE));
2335         grinder->tccache_qmask[grinder->tccache_w] = b;
2336         grinder->tccache_qindex[grinder->tccache_w] = qindex +
2337                 RTE_SCHED_TRAFFIC_CLASS_BE;
2338         grinder->tccache_w += (b != 0);
2339 }
2340
2341 static inline int
2342 grinder_next_tc(struct rte_sched_port *port,
2343         struct rte_sched_subport *subport, uint32_t pos)
2344 {
2345         struct rte_sched_grinder *grinder = subport->grinder + pos;
2346         struct rte_mbuf **qbase;
2347         uint32_t qindex;
2348         uint16_t qsize;
2349
2350         if (grinder->tccache_r == grinder->tccache_w)
2351                 return 0;
2352
2353         qindex = grinder->tccache_qindex[grinder->tccache_r];
2354         qbase = rte_sched_subport_pipe_qbase(subport, qindex);
2355         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
2356
2357         grinder->tc_index = rte_sched_port_pipe_tc(port, qindex);
2358         grinder->qmask = grinder->tccache_qmask[grinder->tccache_r];
2359         grinder->qsize = qsize;
2360
2361         if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) {
2362                 grinder->queue[0] = subport->queue + qindex;
2363                 grinder->qbase[0] = qbase;
2364                 grinder->qindex[0] = qindex;
2365                 grinder->tccache_r++;
2366
2367                 return 1;
2368         }
2369
2370         grinder->queue[0] = subport->queue + qindex;
2371         grinder->queue[1] = subport->queue + qindex + 1;
2372         grinder->queue[2] = subport->queue + qindex + 2;
2373         grinder->queue[3] = subport->queue + qindex + 3;
2374
2375         grinder->qbase[0] = qbase;
2376         grinder->qbase[1] = qbase + qsize;
2377         grinder->qbase[2] = qbase + 2 * qsize;
2378         grinder->qbase[3] = qbase + 3 * qsize;
2379
2380         grinder->qindex[0] = qindex;
2381         grinder->qindex[1] = qindex + 1;
2382         grinder->qindex[2] = qindex + 2;
2383         grinder->qindex[3] = qindex + 3;
2384
2385         grinder->tccache_r++;
2386         return 1;
2387 }
2388
2389 static inline int
2390 grinder_next_pipe(struct rte_sched_port *port,
2391         struct rte_sched_subport *subport, uint32_t pos)
2392 {
2393         struct rte_sched_grinder *grinder = subport->grinder + pos;
2394         uint32_t pipe_qindex;
2395         uint16_t pipe_qmask;
2396
2397         if (grinder->pcache_r < grinder->pcache_w) {
2398                 pipe_qmask = grinder->pcache_qmask[grinder->pcache_r];
2399                 pipe_qindex = grinder->pcache_qindex[grinder->pcache_r];
2400                 grinder->pcache_r++;
2401         } else {
2402                 uint64_t bmp_slab = 0;
2403                 uint32_t bmp_pos = 0;
2404
2405                 /* Get another non-empty pipe group */
2406                 if (unlikely(rte_bitmap_scan(subport->bmp, &bmp_pos, &bmp_slab) <= 0))
2407                         return 0;
2408
2409 #ifdef RTE_SCHED_DEBUG
2410                 debug_check_queue_slab(subport, bmp_pos, bmp_slab);
2411 #endif
2412
2413                 /* Return if pipe group already in one of the other grinders */
2414                 subport->grinder_base_bmp_pos[pos] = RTE_SCHED_BMP_POS_INVALID;
2415                 if (unlikely(grinder_pipe_exists(subport, bmp_pos)))
2416                         return 0;
2417
2418                 subport->grinder_base_bmp_pos[pos] = bmp_pos;
2419
2420                 /* Install new pipe group into grinder's pipe cache */
2421                 grinder_pcache_populate(subport, pos, bmp_pos, bmp_slab);
2422
2423                 pipe_qmask = grinder->pcache_qmask[0];
2424                 pipe_qindex = grinder->pcache_qindex[0];
2425                 grinder->pcache_r = 1;
2426         }
2427
2428         /* Install new pipe in the grinder */
2429         grinder->pindex = pipe_qindex >> 4;
2430         grinder->subport = subport;
2431         grinder->pipe = subport->pipe + grinder->pindex;
2432         grinder->pipe_params = NULL; /* to be set after the pipe structure is prefetched */
2433         grinder->productive = 0;
2434
2435         grinder_tccache_populate(subport, pos, pipe_qindex, pipe_qmask);
2436         grinder_next_tc(port, subport, pos);
2437
2438         /* Check for pipe exhaustion */
2439         if (grinder->pindex == subport->pipe_loop) {
2440                 subport->pipe_exhaustion = 1;
2441                 subport->pipe_loop = RTE_SCHED_PIPE_INVALID;
2442         }
2443
2444         return 1;
2445 }
2446
2447
2448 static inline void
2449 grinder_wrr_load(struct rte_sched_subport *subport, uint32_t pos)
2450 {
2451         struct rte_sched_grinder *grinder = subport->grinder + pos;
2452         struct rte_sched_pipe *pipe = grinder->pipe;
2453         struct rte_sched_pipe_profile *pipe_params = grinder->pipe_params;
2454         uint32_t qmask = grinder->qmask;
2455
2456         grinder->wrr_tokens[0] =
2457                 ((uint16_t) pipe->wrr_tokens[0]) << RTE_SCHED_WRR_SHIFT;
2458         grinder->wrr_tokens[1] =
2459                 ((uint16_t) pipe->wrr_tokens[1]) << RTE_SCHED_WRR_SHIFT;
2460         grinder->wrr_tokens[2] =
2461                 ((uint16_t) pipe->wrr_tokens[2]) << RTE_SCHED_WRR_SHIFT;
2462         grinder->wrr_tokens[3] =
2463                 ((uint16_t) pipe->wrr_tokens[3]) << RTE_SCHED_WRR_SHIFT;
2464
2465         grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
2466         grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
2467         grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
2468         grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
2469
2470         grinder->wrr_cost[0] = pipe_params->wrr_cost[0];
2471         grinder->wrr_cost[1] = pipe_params->wrr_cost[1];
2472         grinder->wrr_cost[2] = pipe_params->wrr_cost[2];
2473         grinder->wrr_cost[3] = pipe_params->wrr_cost[3];
2474 }
2475
2476 static inline void
2477 grinder_wrr_store(struct rte_sched_subport *subport, uint32_t pos)
2478 {
2479         struct rte_sched_grinder *grinder = subport->grinder + pos;
2480         struct rte_sched_pipe *pipe = grinder->pipe;
2481
2482         pipe->wrr_tokens[0] =
2483                         (grinder->wrr_tokens[0] & grinder->wrr_mask[0]) >>
2484                                 RTE_SCHED_WRR_SHIFT;
2485         pipe->wrr_tokens[1] =
2486                         (grinder->wrr_tokens[1] & grinder->wrr_mask[1]) >>
2487                                 RTE_SCHED_WRR_SHIFT;
2488         pipe->wrr_tokens[2] =
2489                         (grinder->wrr_tokens[2] & grinder->wrr_mask[2]) >>
2490                                 RTE_SCHED_WRR_SHIFT;
2491         pipe->wrr_tokens[3] =
2492                         (grinder->wrr_tokens[3] & grinder->wrr_mask[3]) >>
2493                                 RTE_SCHED_WRR_SHIFT;
2494 }
2495
2496 static inline void
2497 grinder_wrr(struct rte_sched_subport *subport, uint32_t pos)
2498 {
2499         struct rte_sched_grinder *grinder = subport->grinder + pos;
2500         uint16_t wrr_tokens_min;
2501
2502         grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
2503         grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
2504         grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
2505         grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
2506
2507         grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
2508         wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
2509
2510         grinder->wrr_tokens[0] -= wrr_tokens_min;
2511         grinder->wrr_tokens[1] -= wrr_tokens_min;
2512         grinder->wrr_tokens[2] -= wrr_tokens_min;
2513         grinder->wrr_tokens[3] -= wrr_tokens_min;
2514 }
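
/*
 * Worked example (made-up tokens): wrr_tokens = {30, 10, 50, 20} with
 * all four queues active. The OR with ~wrr_mask leaves active queues
 * untouched and pins inactive ones at 0xFFFF so they can never win the
 * minimum. Queue 1 holds the minimum (10), so qpos = 1 and the
 * subtraction rebases the tokens to {20, 0, 40, 10}, preserving the
 * relative cost of the remaining queues.
 */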
2515
2516
2517 #define grinder_evict(subport, pos)
2518
2519 static inline void
2520 grinder_prefetch_pipe(struct rte_sched_subport *subport, uint32_t pos)
2521 {
2522         struct rte_sched_grinder *grinder = subport->grinder + pos;
2523
2524         rte_prefetch0(grinder->pipe);
2525         rte_prefetch0(grinder->queue[0]);
2526 }
2527
2528 static inline void
2529 grinder_prefetch_tc_queue_arrays(struct rte_sched_subport *subport, uint32_t pos)
2530 {
2531         struct rte_sched_grinder *grinder = subport->grinder + pos;
2532         uint16_t qsize, qr[RTE_SCHED_MAX_QUEUES_PER_TC];
2533
2534         qsize = grinder->qsize;
2535         grinder->qpos = 0;
2536
2537         if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) {
2538                 qr[0] = grinder->queue[0]->qr & (qsize - 1);
2539
2540                 rte_prefetch0(grinder->qbase[0] + qr[0]);
2541                 return;
2542         }
2543
2544         qr[0] = grinder->queue[0]->qr & (qsize - 1);
2545         qr[1] = grinder->queue[1]->qr & (qsize - 1);
2546         qr[2] = grinder->queue[2]->qr & (qsize - 1);
2547         qr[3] = grinder->queue[3]->qr & (qsize - 1);
2548
2549         rte_prefetch0(grinder->qbase[0] + qr[0]);
2550         rte_prefetch0(grinder->qbase[1] + qr[1]);
2551
2552         grinder_wrr_load(subport, pos);
2553         grinder_wrr(subport, pos);
2554
2555         rte_prefetch0(grinder->qbase[2] + qr[2]);
2556         rte_prefetch0(grinder->qbase[3] + qr[3]);
2557 }
2558
2559 static inline void
2560 grinder_prefetch_mbuf(struct rte_sched_subport *subport, uint32_t pos)
2561 {
2562         struct rte_sched_grinder *grinder = subport->grinder + pos;
2563         uint32_t qpos = grinder->qpos;
2564         struct rte_mbuf **qbase = grinder->qbase[qpos];
2565         uint16_t qsize = grinder->qsize;
2566         uint16_t qr = grinder->queue[qpos]->qr & (qsize - 1);
2567
2568         grinder->pkt = qbase[qr];
2569         rte_prefetch0(grinder->pkt);
2570
2571         if (unlikely((qr & 0x7) == 7)) {
2572                 uint16_t qr_next = (grinder->queue[qpos]->qr + 1) & (qsize - 1);
2573
2574                 rte_prefetch0(qbase + qr_next);
2575         }
2576 }
2577
2578 static inline uint32_t
2579 grinder_handle(struct rte_sched_port *port,
2580         struct rte_sched_subport *subport, uint32_t pos)
2581 {
2582         struct rte_sched_grinder *grinder = subport->grinder + pos;
2583
2584         switch (grinder->state) {
2585         case e_GRINDER_PREFETCH_PIPE:
2586         {
2587                 if (grinder_next_pipe(port, subport, pos)) {
2588                         grinder_prefetch_pipe(subport, pos);
2589                         subport->busy_grinders++;
2590
2591                         grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS;
2592                         return 0;
2593                 }
2594
2595                 return 0;
2596         }
2597
2598         case e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS:
2599         {
2600                 struct rte_sched_pipe *pipe = grinder->pipe;
2601
2602                 grinder->pipe_params = subport->pipe_profiles + pipe->profile;
2603                 grinder_prefetch_tc_queue_arrays(subport, pos);
2604                 grinder_credits_update(port, subport, pos);
2605
2606                 grinder->state = e_GRINDER_PREFETCH_MBUF;
2607                 return 0;
2608         }
2609
2610         case e_GRINDER_PREFETCH_MBUF:
2611         {
2612                 grinder_prefetch_mbuf(subport, pos);
2613
2614                 grinder->state = e_GRINDER_READ_MBUF;
2615                 return 0;
2616         }
2617
2618         case e_GRINDER_READ_MBUF:
2619         {
2620                 uint32_t wrr_active, result = 0;
2621
2622                 result = grinder_schedule(port, subport, pos);
2623
2624                 wrr_active = (grinder->tc_index == RTE_SCHED_TRAFFIC_CLASS_BE);
2625
2626                 /* Look for next packet within the same TC */
2627                 if (result && grinder->qmask) {
2628                         if (wrr_active)
2629                                 grinder_wrr(subport, pos);
2630
2631                         grinder_prefetch_mbuf(subport, pos);
2632
2633                         return 1;
2634                 }
2635
2636                 if (wrr_active)
2637                         grinder_wrr_store(subport, pos);
2638
2639                 /* Look for another active TC within same pipe */
2640                 if (grinder_next_tc(port, subport, pos)) {
2641                         grinder_prefetch_tc_queue_arrays(subport, pos);
2642
2643                         grinder->state = e_GRINDER_PREFETCH_MBUF;
2644                         return result;
2645                 }
2646
2647                 if (grinder->productive == 0 &&
2648                     subport->pipe_loop == RTE_SCHED_PIPE_INVALID)
2649                         subport->pipe_loop = grinder->pindex;
2650
2651                 grinder_evict(subport, pos);
2652
2653                 /* Look for another active pipe */
2654                 if (grinder_next_pipe(port, subport, pos)) {
2655                         grinder_prefetch_pipe(subport, pos);
2656
2657                         grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS;
2658                         return result;
2659                 }
2660
2661                 /* No active pipe found */
2662                 subport->busy_grinders--;
2663
2664                 grinder->state = e_GRINDER_PREFETCH_PIPE;
2665                 return result;
2666         }
2667
2668         default:
2669                 rte_panic("Algorithmic error (invalid state)\n");
2670                 return 0;
2671         }
2672 }
2673
2674 static inline void
2675 rte_sched_port_time_resync(struct rte_sched_port *port)
2676 {
2677         uint64_t cycles = rte_get_tsc_cycles();
2678         uint64_t cycles_diff;
2679         uint64_t bytes_diff;
2680         uint32_t i;
2681
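        /*
         * If the TSC reading went backwards (e.g. counter wrap-around),
         * restart the time reference instead of underflowing below.
         */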
2682         if (cycles < port->time_cpu_cycles)
2683                 port->time_cpu_cycles = 0;
2684
2685         cycles_diff = cycles - port->time_cpu_cycles;
2686         /* Compute elapsed time in bytes */
2687         bytes_diff = rte_reciprocal_divide(cycles_diff << RTE_SCHED_TIME_SHIFT,
2688                                            port->inv_cycles_per_byte);
2689
2690         /* Advance port time */
2691         port->time_cpu_cycles +=
2692                 (bytes_diff * port->cycles_per_byte) >> RTE_SCHED_TIME_SHIFT;
2693         port->time_cpu_bytes += bytes_diff;
2694         if (port->time < port->time_cpu_bytes)
2695                 port->time = port->time_cpu_bytes;
2696
2697         /* Reset pipe loop detection */
2698         for (i = 0; i < port->n_subports_per_port; i++)
2699                 port->subports[i]->pipe_loop = RTE_SCHED_PIPE_INVALID;
2700 }
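
/*
 * Worked example (made-up rate): with RTE_SCHED_TIME_SHIFT = 8, a rate
 * of 2.5 cycles/byte is stored as cycles_per_byte = 640 (2.5 * 256), so
 * cycles_diff = 6400 converts to bytes_diff = (6400 << 8) / 640 = 2560
 * bytes of elapsed time. Advancing time_cpu_cycles by
 * (2560 * 640) >> 8 = 6400 rather than by cycles_diff itself keeps the
 * truncation error of the division from accumulating across calls.
 */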
2701
2702 static inline int
2703 rte_sched_port_exceptions(struct rte_sched_subport *subport, int second_pass)
2704 {
2705         int exceptions;
2706
2707         /* Check if any exception flag is set */
2708         exceptions = (second_pass && subport->busy_grinders == 0) ||
2709                 (subport->pipe_exhaustion == 1);
2710
2711         /* Clear exception flags */
2712         subport->pipe_exhaustion = 0;
2713
2714         return exceptions;
2715 }
2716
2717 int
2718 rte_sched_port_dequeue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts)
2719 {
2720         struct rte_sched_subport *subport;
2721         uint32_t subport_id = port->subport_id;
2722         uint32_t i, n_subports = 0, count;
2723
2724         port->pkts_out = pkts;
2725         port->n_pkts_out = 0;
2726
2727         rte_sched_port_time_resync(port);
2728
2729         /* Advance each grinder by one step */
2730         for (i = 0, count = 0; ; i++)  {
2731                 subport = port->subports[subport_id];
2732
2733                 count += grinder_handle(port, subport,
2734                                 i & (RTE_SCHED_PORT_N_GRINDERS - 1));
2735
2736                 if (count == n_pkts) {
2737                         subport_id++;
2738
2739                         if (subport_id == port->n_subports_per_port)
2740                                 subport_id = 0;
2741
2742                         port->subport_id = subport_id;
2743                         break;
2744                 }
2745
2746                 if (rte_sched_port_exceptions(subport, i >= RTE_SCHED_PORT_N_GRINDERS)) {
2747                         i = 0;
2748                         subport_id++;
2749                         n_subports++;
2750                 }
2751
2752                 if (subport_id == port->n_subports_per_port)
2753                         subport_id = 0;
2754
2755                 if (n_subports == port->n_subports_per_port) {
2756                         port->subport_id = subport_id;
2757                         break;
2758                 }
2759         }
2760
2761         return count;
2762 }
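
/*
 * Usage sketch (hypothetical run-to-completion loop; enqueue and dequeue
 * for a given port are normally driven from a single lcore):
 *
 *   struct rte_mbuf *mbufs[64];
 *   uint16_t n = rte_eth_rx_burst(port_id, 0, mbufs, 64);
 *
 *   rte_sched_port_enqueue(sched, mbufs, n);
 *   n = rte_sched_port_dequeue(sched, mbufs, 64);
 *   rte_eth_tx_burst(port_id, 0, mbufs, n);
 */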