sched: update grinder functions for config flexibility
lib/librte_sched/rte_sched.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <string.h>

#include <rte_common.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_branch_prediction.h>
#include <rte_mbuf.h>
#include <rte_bitmap.h>
#include <rte_reciprocal.h>

#include "rte_sched.h"
#include "rte_sched_common.h"
#include "rte_approx.h"

#ifdef __INTEL_COMPILER
#pragma warning(disable:2259) /* conversion may lose significant bits */
#endif

#ifdef RTE_SCHED_VECTOR
#include <rte_vect.h>

#ifdef RTE_ARCH_X86
#define SCHED_VECTOR_SSE4
#elif defined(RTE_MACHINE_CPUFLAG_NEON)
#define SCHED_VECTOR_NEON
#endif

#endif

#define RTE_SCHED_TB_RATE_CONFIG_ERR          (1e-7)
#define RTE_SCHED_WRR_SHIFT                   3
#define RTE_SCHED_MAX_QUEUES_PER_TC           RTE_SCHED_BE_QUEUES_PER_PIPE
#define RTE_SCHED_GRINDER_PCACHE_SIZE         (64 / RTE_SCHED_QUEUES_PER_PIPE)
#define RTE_SCHED_PIPE_INVALID                UINT32_MAX
#define RTE_SCHED_BMP_POS_INVALID             UINT32_MAX

/* Scaling for cycles_per_byte calculation
 * Chosen so that minimum rate is 480 bit/sec
 */
#define RTE_SCHED_TIME_SHIFT                  8
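
/*
 * Illustrative arithmetic (assuming a 1 GHz TSC): cycles_per_byte is
 * computed below as (hz << RTE_SCHED_TIME_SHIFT) / rate, i.e. a fixed
 * point value with 8 fractional bits. With hz = 1e9, the scaled value
 * still fits a 32-bit quantity down to roughly 1e9 * 256 / 2^32 ~= 60
 * bytes/sec, which is where the 480 bit/sec minimum rate above comes from.
 */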

struct rte_sched_pipe_profile {
        /* Token bucket (TB) */
        uint32_t tb_period;
        uint32_t tb_credits_per_period;
        uint32_t tb_size;

        /* Pipe traffic classes */
        uint32_t tc_period;
        uint32_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint8_t tc_ov_weight;

        /* Pipe best-effort traffic class queues */
        uint8_t  wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
};

struct rte_sched_pipe {
        /* Token bucket (TB) */
        uint64_t tb_time; /* time of last update */
        uint32_t tb_credits;

        /* Pipe profile and flags */
        uint32_t profile;

        /* Traffic classes (TCs) */
        uint64_t tc_time; /* time of next update */
        uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];

        /* Weighted Round Robin (WRR) */
        uint8_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE];

        /* TC oversubscription */
        uint32_t tc_ov_credits;
        uint8_t tc_ov_period_id;
} __rte_cache_aligned;

struct rte_sched_queue {
        uint16_t qw; /* Next position to write (enqueue), free-running */
        uint16_t qr; /* Next position to read (dequeue), free-running */
};

struct rte_sched_queue_extra {
        struct rte_sched_queue_stats stats;
#ifdef RTE_SCHED_RED
        struct rte_red red;
#endif
};

enum grinder_state {
        e_GRINDER_PREFETCH_PIPE = 0,
        e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS,
        e_GRINDER_PREFETCH_MBUF,
        e_GRINDER_READ_MBUF
};
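
/*
 * Each grinder advances one pipe at a time through the small prefetch
 * pipeline encoded by the states above: prefetch the pipe data structure,
 * then the queue arrays of the current traffic class, then the head mbuf,
 * and finally read and schedule that mbuf.
 */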

struct rte_sched_grinder {
        /* Pipe cache */
        uint16_t pcache_qmask[RTE_SCHED_GRINDER_PCACHE_SIZE];
        uint32_t pcache_qindex[RTE_SCHED_GRINDER_PCACHE_SIZE];
        uint32_t pcache_w;
        uint32_t pcache_r;

        /* Current pipe */
        enum grinder_state state;
        uint32_t productive;
        uint32_t pindex;
        struct rte_sched_subport *subport;
        struct rte_sched_pipe *pipe;
        struct rte_sched_pipe_profile *pipe_params;

        /* TC cache */
        uint8_t tccache_qmask[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tccache_qindex[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tccache_w;
        uint32_t tccache_r;

        /* Current TC */
        uint32_t tc_index;
        struct rte_sched_queue *queue[RTE_SCHED_MAX_QUEUES_PER_TC];
        struct rte_mbuf **qbase[RTE_SCHED_MAX_QUEUES_PER_TC];
        uint32_t qindex[RTE_SCHED_MAX_QUEUES_PER_TC];
        uint16_t qsize;
        uint32_t qmask;
        uint32_t qpos;
        struct rte_mbuf *pkt;

        /* WRR */
        uint16_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint16_t wrr_mask[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint8_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
};

struct rte_sched_subport {
        /* Token bucket (TB) */
        uint64_t tb_time; /* time of last update */
        uint32_t tb_period;
        uint32_t tb_credits_per_period;
        uint32_t tb_size;
        uint32_t tb_credits;

        /* Traffic classes (TCs) */
        uint64_t tc_time; /* time of next update */
        uint32_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tc_period;

        /* TC oversubscription */
        uint32_t tc_ov_wm;
        uint32_t tc_ov_wm_min;
        uint32_t tc_ov_wm_max;
        uint8_t tc_ov_period_id;
        uint8_t tc_ov;
        uint32_t tc_ov_n;
        double tc_ov_rate;

        /* Statistics */
        struct rte_sched_subport_stats stats;

        /* Subport pipes */
        uint32_t n_pipes_per_subport_enabled;
        uint32_t n_pipe_profiles;
        uint32_t n_max_pipe_profiles;

        /* Pipe best-effort TC rate */
        uint32_t pipe_tc_be_rate_max;

        /* Pipe queues size */
        uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];

#ifdef RTE_SCHED_RED
        struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS];
#endif

        /* Scheduling loop detection */
        uint32_t pipe_loop;
        uint32_t pipe_exhaustion;

        /* Bitmap */
        struct rte_bitmap *bmp;
        uint32_t grinder_base_bmp_pos[RTE_SCHED_PORT_N_GRINDERS] __rte_aligned_16;

        /* Grinders */
        struct rte_sched_grinder grinder[RTE_SCHED_PORT_N_GRINDERS];
        uint32_t busy_grinders;

        /* Queue base calculation */
        uint32_t qsize_add[RTE_SCHED_QUEUES_PER_PIPE];
        uint32_t qsize_sum;

        struct rte_sched_pipe *pipe;
        struct rte_sched_queue *queue;
        struct rte_sched_queue_extra *queue_extra;
        struct rte_sched_pipe_profile *pipe_profiles;
        uint8_t *bmp_array;
        struct rte_mbuf **queue_array;
        uint8_t memory[0] __rte_cache_aligned;
} __rte_cache_aligned;

struct rte_sched_port {
        /* User parameters */
        uint32_t n_subports_per_port;
        uint32_t n_pipes_per_subport;
        uint32_t n_pipes_per_subport_log2;
        uint16_t pipe_queue[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint8_t pipe_tc[RTE_SCHED_QUEUES_PER_PIPE];
        uint8_t tc_queue[RTE_SCHED_QUEUES_PER_PIPE];
        uint32_t rate;
        uint32_t mtu;
        uint32_t frame_overhead;
        int socket;
        uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t n_pipe_profiles;
        uint32_t n_max_pipe_profiles;
        uint32_t pipe_tc_be_rate_max;
#ifdef RTE_SCHED_RED
        struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS];
#endif

        /* Timing */
        uint64_t time_cpu_cycles;     /* Current CPU time measured in CPU cycles */
        uint64_t time_cpu_bytes;      /* Current CPU time measured in bytes */
        uint64_t time;                /* Current NIC TX time measured in bytes */
        struct rte_reciprocal inv_cycles_per_byte; /* CPU cycles per byte */

        /* Scheduling loop detection */
        uint32_t pipe_loop;
        uint32_t pipe_exhaustion;

        /* Bitmap */
        struct rte_bitmap *bmp;
        uint32_t grinder_base_bmp_pos[RTE_SCHED_PORT_N_GRINDERS] __rte_aligned_16;

        /* Grinders */
        struct rte_sched_grinder grinder[RTE_SCHED_PORT_N_GRINDERS];
        uint32_t busy_grinders;
        struct rte_mbuf **pkts_out;
        uint32_t n_pkts_out;

        /* Queue base calculation */
        uint32_t qsize_add[RTE_SCHED_QUEUES_PER_PIPE];
        uint32_t qsize_sum;

        /* Large data structures */
        struct rte_sched_subport *subports[0];
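        /* Transitional: the flat per-port fields below duplicate state that
         * is being moved into struct rte_sched_subport as part of this rework.
         */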
        struct rte_sched_subport *subport;
        struct rte_sched_pipe *pipe;
        struct rte_sched_queue *queue;
        struct rte_sched_queue_extra *queue_extra;
        struct rte_sched_pipe_profile *pipe_profiles;
        uint8_t *bmp_array;
        struct rte_mbuf **queue_array;
        uint8_t memory[0] __rte_cache_aligned;
} __rte_cache_aligned;

enum rte_sched_port_array {
        e_RTE_SCHED_PORT_ARRAY_SUBPORT = 0,
        e_RTE_SCHED_PORT_ARRAY_PIPE,
        e_RTE_SCHED_PORT_ARRAY_QUEUE,
        e_RTE_SCHED_PORT_ARRAY_QUEUE_EXTRA,
        e_RTE_SCHED_PORT_ARRAY_PIPE_PROFILES,
        e_RTE_SCHED_PORT_ARRAY_BMP_ARRAY,
        e_RTE_SCHED_PORT_ARRAY_QUEUE_ARRAY,
        e_RTE_SCHED_PORT_ARRAY_TOTAL,
};

enum rte_sched_subport_array {
        e_RTE_SCHED_SUBPORT_ARRAY_PIPE = 0,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA,
        e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES,
        e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY,
        e_RTE_SCHED_SUBPORT_ARRAY_TOTAL,
};

static inline uint32_t
rte_sched_subport_pipe_queues(struct rte_sched_subport *subport)
{
        return RTE_SCHED_QUEUES_PER_PIPE * subport->n_pipes_per_subport_enabled;
}

static inline struct rte_mbuf **
rte_sched_subport_pipe_qbase(struct rte_sched_subport *subport, uint32_t qindex)
{
        uint32_t pindex = qindex >> 4; /* 16 (RTE_SCHED_QUEUES_PER_PIPE) queues per pipe */
        uint32_t qpos = qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1);

        return (subport->queue_array + pindex *
                subport->qsize_sum + subport->qsize_add[qpos]);
}

static inline uint16_t
rte_sched_subport_pipe_qsize(struct rte_sched_port *port,
        struct rte_sched_subport *subport, uint32_t qindex)
{
        uint32_t tc = port->pipe_tc[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return subport->qsize[tc];
}

static inline uint32_t
rte_sched_port_queues_per_port(struct rte_sched_port *port)
{
        return RTE_SCHED_QUEUES_PER_PIPE * port->n_pipes_per_subport * port->n_subports_per_port;
}

static inline uint16_t
rte_sched_port_pipe_queue(struct rte_sched_port *port, uint32_t traffic_class)
{
        uint16_t pipe_queue = port->pipe_queue[traffic_class];

        return pipe_queue;
}

static inline uint8_t
rte_sched_port_pipe_tc(struct rte_sched_port *port, uint32_t qindex)
{
        uint8_t pipe_tc = port->pipe_tc[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return pipe_tc;
}

static inline uint8_t
rte_sched_port_tc_queue(struct rte_sched_port *port, uint32_t qindex)
{
        uint8_t tc_queue = port->tc_queue[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return tc_queue;
}

static int
pipe_profile_check(struct rte_sched_pipe_params *params,
        uint32_t rate, uint16_t *qsize)
{
        uint32_t i;

        /* Pipe parameters */
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        /* TB rate: non-zero, not greater than port rate */
        if (params->tb_rate == 0 ||
                params->tb_rate > rate) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb rate\n", __func__);
                return -EINVAL;
        }

        /* TB size: non-zero */
        if (params->tb_size == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb size\n", __func__);
                return -EINVAL;
        }

        /* TC rate: non-zero if qsize non-zero, less than pipe rate */
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if ((qsize[i] == 0 && params->tc_rate[i] != 0) ||
                        (qsize[i] != 0 && (params->tc_rate[i] == 0 ||
                        params->tc_rate[i] > params->tb_rate))) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for qsize or tc_rate\n", __func__);
                        return -EINVAL;
                }
        }

        if (params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE] == 0 ||
                qsize[RTE_SCHED_TRAFFIC_CLASS_BE] == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for be traffic class rate\n", __func__);
                return -EINVAL;
        }

        /* TC period: non-zero */
        if (params->tc_period == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc period\n", __func__);
                return -EINVAL;
        }

        /* Best effort tc oversubscription weight: non-zero */
        if (params->tc_ov_weight == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc ov weight\n", __func__);
                return -EINVAL;
        }

        /* Queue WRR weights: non-zero */
        for (i = 0; i < RTE_SCHED_BE_QUEUES_PER_PIPE; i++) {
                if (params->wrr_weights[i] == 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for wrr weight\n", __func__);
                        return -EINVAL;
                }
        }

        return 0;
}
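
/*
 * A pipe profile that would pass the checks above (illustrative values
 * only): tb_rate and tb_size non-zero, every TC rate non-zero and not
 * above tb_rate, non-zero tc_period and tc_ov_weight, and non-zero WRR
 * weights for the four best-effort queues.
 *
 *	static struct rte_sched_pipe_params pp = {
 *		.tb_rate = 305175, .tb_size = 1000000,
 *		.tc_rate = {[0 ... RTE_SCHED_TRAFFIC_CLASS_BE] = 305175},
 *		.tc_period = 40,
 *		.tc_ov_weight = 1,
 *		.wrr_weights = {1, 1, 1, 1},
 *	};
 */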

static int
rte_sched_port_check_params(struct rte_sched_port_params *params)
{
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        /* socket */
        if (params->socket < 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for socket id\n", __func__);
                return -EINVAL;
        }

        /* rate */
        if (params->rate == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for rate\n", __func__);
                return -EINVAL;
        }

        /* mtu */
        if (params->mtu == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for mtu\n", __func__);
                return -EINVAL;
        }

        /* n_subports_per_port: non-zero, limited to 16 bits, power of 2 */
        if (params->n_subports_per_port == 0 ||
            params->n_subports_per_port > 1u << 16 ||
            !rte_is_power_of_2(params->n_subports_per_port)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for number of subports\n", __func__);
                return -EINVAL;
        }

        /* n_pipes_per_subport: non-zero, power of 2 */
        if (params->n_pipes_per_subport == 0 ||
            !rte_is_power_of_2(params->n_pipes_per_subport)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for maximum number of pipes\n", __func__);
                return -EINVAL;
        }

        return 0;
}

static uint32_t
rte_sched_subport_get_array_base(struct rte_sched_subport_params *params,
        enum rte_sched_subport_array array)
{
        uint32_t n_pipes_per_subport = params->n_pipes_per_subport_enabled;
        uint32_t n_subport_pipe_queues =
                RTE_SCHED_QUEUES_PER_PIPE * n_pipes_per_subport;

        uint32_t size_pipe = n_pipes_per_subport * sizeof(struct rte_sched_pipe);
        uint32_t size_queue =
                n_subport_pipe_queues * sizeof(struct rte_sched_queue);
        uint32_t size_queue_extra
                = n_subport_pipe_queues * sizeof(struct rte_sched_queue_extra);
        uint32_t size_pipe_profiles = params->n_max_pipe_profiles *
                sizeof(struct rte_sched_pipe_profile);
        uint32_t size_bmp_array =
                rte_bitmap_get_memory_footprint(n_subport_pipe_queues);
        uint32_t size_per_pipe_queue_array, size_queue_array;

        uint32_t base, i;

        size_per_pipe_queue_array = 0;
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if (i < RTE_SCHED_TRAFFIC_CLASS_BE)
                        size_per_pipe_queue_array +=
                                params->qsize[i] * sizeof(struct rte_mbuf *);
                else
                        size_per_pipe_queue_array += RTE_SCHED_MAX_QUEUES_PER_TC *
                                params->qsize[i] * sizeof(struct rte_mbuf *);
        }
        size_queue_array = n_pipes_per_subport * size_per_pipe_queue_array;

        base = 0;

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_PIPE)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_pipe);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue_extra);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_pipe_profiles);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_bmp_array);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue_array);

        return base;
}
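
/*
 * The per-subport memory blob is laid out as consecutive cache-line
 * aligned arrays in the enum order above: pipes, queues, queue extras,
 * pipe profiles, bitmap array and the mbuf pointer (queue) array; asking
 * for e_RTE_SCHED_SUBPORT_ARRAY_TOTAL therefore returns the total size.
 */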

static void
rte_sched_subport_config_qsize(struct rte_sched_subport *subport)
{
        uint32_t i;

        subport->qsize_add[0] = 0;

        /* Strict priority traffic class */
        for (i = 1; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                subport->qsize_add[i] = subport->qsize_add[i-1] + subport->qsize[i-1];

        /* Best-effort traffic class */
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 1] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 2] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 1] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 3] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 2] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];

        subport->qsize_sum = subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 3] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
}
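
/*
 * Worked example (hypothetical sizes): with twelve 64-entry strict
 * priority queues and a 256-entry best-effort class, qsize_add becomes
 * {0, 64, 128, ..., 704, 768, 1024, 1280, 1536} and qsize_sum = 1792
 * mbuf pointers per pipe, with queues 12..15 holding the four
 * best-effort queues.
 */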

static void
rte_sched_port_log_pipe_profile(struct rte_sched_subport *subport, uint32_t i)
{
        struct rte_sched_pipe_profile *p = subport->pipe_profiles + i;

        RTE_LOG(DEBUG, SCHED, "Low level config for pipe profile %u:\n"
                "       Token bucket: period = %u, credits per period = %u, size = %u\n"
                "       Traffic classes: period = %u,\n"
                "       credits per period = [%u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u]\n"
                "       Best-effort traffic class oversubscription: weight = %hhu\n"
                "       WRR cost: [%hhu, %hhu, %hhu, %hhu]\n",
                i,

                /* Token bucket */
                p->tb_period,
                p->tb_credits_per_period,
                p->tb_size,

                /* Traffic classes */
                p->tc_period,
                p->tc_credits_per_period[0],
                p->tc_credits_per_period[1],
                p->tc_credits_per_period[2],
                p->tc_credits_per_period[3],
                p->tc_credits_per_period[4],
                p->tc_credits_per_period[5],
                p->tc_credits_per_period[6],
                p->tc_credits_per_period[7],
                p->tc_credits_per_period[8],
                p->tc_credits_per_period[9],
                p->tc_credits_per_period[10],
                p->tc_credits_per_period[11],
                p->tc_credits_per_period[12],

                /* Best-effort traffic class oversubscription */
                p->tc_ov_weight,

                /* WRR */
                p->wrr_cost[0], p->wrr_cost[1], p->wrr_cost[2], p->wrr_cost[3]);
}

static inline uint64_t
rte_sched_time_ms_to_bytes(uint32_t time_ms, uint32_t rate)
{
        uint64_t time = time_ms;

        time = (time * rate) / 1000;

        return time;
}
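
/*
 * Example: at rate = 1,250,000,000 bytes/sec (10 GbE) a period of 10 ms
 * converts to 12,500,000 bytes of credit per enforcement period.
 */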

static void
rte_sched_pipe_profile_convert(struct rte_sched_subport *subport,
        struct rte_sched_pipe_params *src,
        struct rte_sched_pipe_profile *dst,
        uint32_t rate)
{
        uint32_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint32_t lcd1, lcd2, lcd;
        uint32_t i;

        /* Token Bucket */
        if (src->tb_rate == rate) {
                dst->tb_credits_per_period = 1;
                dst->tb_period = 1;
        } else {
                double tb_rate = (double) src->tb_rate
                                / (double) rate;
                double d = RTE_SCHED_TB_RATE_CONFIG_ERR;

                rte_approx(tb_rate, d,
                        &dst->tb_credits_per_period, &dst->tb_period);
        }

        dst->tb_size = src->tb_size;

        /* Traffic Classes */
        dst->tc_period = rte_sched_time_ms_to_bytes(src->tc_period,
                                                rate);

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (subport->qsize[i])
                        dst->tc_credits_per_period[i]
                                = rte_sched_time_ms_to_bytes(src->tc_period,
                                        src->tc_rate[i]);

        dst->tc_ov_weight = src->tc_ov_weight;

        /* WRR queues */
        wrr_cost[0] = src->wrr_weights[0];
        wrr_cost[1] = src->wrr_weights[1];
        wrr_cost[2] = src->wrr_weights[2];
        wrr_cost[3] = src->wrr_weights[3];

        lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]);
        lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]);
        lcd = rte_get_lcd(lcd1, lcd2);

        wrr_cost[0] = lcd / wrr_cost[0];
        wrr_cost[1] = lcd / wrr_cost[1];
        wrr_cost[2] = lcd / wrr_cost[2];
        wrr_cost[3] = lcd / wrr_cost[3];

        dst->wrr_cost[0] = (uint8_t) wrr_cost[0];
        dst->wrr_cost[1] = (uint8_t) wrr_cost[1];
        dst->wrr_cost[2] = (uint8_t) wrr_cost[2];
        dst->wrr_cost[3] = (uint8_t) wrr_cost[3];
}
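
/*
 * WRR cost example (hypothetical weights): wrr_weights = {1, 2, 4, 8}
 * gives lcd = 8 and wrr_cost = {8, 4, 2, 1}. A served queue accumulates
 * tokens in proportion to its cost, and the best-effort queue with the
 * fewest tokens is picked next, so the heaviest weight pays the least
 * per byte transmitted.
 */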

static void
rte_sched_subport_config_pipe_profile_table(struct rte_sched_subport *subport,
        struct rte_sched_subport_params *params, uint32_t rate)
{
        uint32_t i;

        for (i = 0; i < subport->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *src = params->pipe_profiles + i;
                struct rte_sched_pipe_profile *dst = subport->pipe_profiles + i;

                rte_sched_pipe_profile_convert(subport, src, dst, rate);
                rte_sched_port_log_pipe_profile(subport, i);
        }

        subport->pipe_tc_be_rate_max = 0;
        for (i = 0; i < subport->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *src = params->pipe_profiles + i;
                uint32_t pipe_tc_be_rate = src->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE];

                if (subport->pipe_tc_be_rate_max < pipe_tc_be_rate)
                        subport->pipe_tc_be_rate_max = pipe_tc_be_rate;
        }
}

static int
rte_sched_subport_check_params(struct rte_sched_subport_params *params,
        uint32_t n_max_pipes_per_subport,
        uint32_t rate)
{
        uint32_t i;

        /* Check user parameters */
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        if (params->tb_rate == 0 || params->tb_rate > rate) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb rate\n", __func__);
                return -EINVAL;
        }

        if (params->tb_size == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb size\n", __func__);
                return -EINVAL;
        }

        /* qsize: if non-zero, power of 2,
         * no bigger than 32K (due to 16-bit read/write pointers)
         */
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint16_t qsize = params->qsize[i];

                if (qsize != 0 && !rte_is_power_of_2(qsize)) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for qsize\n", __func__);
                        return -EINVAL;
                }
        }

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint32_t tc_rate = params->tc_rate[i];
                uint16_t qsize = params->qsize[i];

                if ((qsize == 0 && tc_rate != 0) ||
                        (qsize != 0 && tc_rate == 0) ||
                        (tc_rate > params->tb_rate)) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for tc rate\n", __func__);
                        return -EINVAL;
                }
        }

        if (params->qsize[RTE_SCHED_TRAFFIC_CLASS_BE] == 0 ||
                params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE] == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect qsize or tc rate (best effort)\n", __func__);
                return -EINVAL;
        }

        if (params->tc_period == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc period\n", __func__);
                return -EINVAL;
        }

        /* n_pipes_per_subport: non-zero, power of 2 */
        if (params->n_pipes_per_subport_enabled == 0 ||
                params->n_pipes_per_subport_enabled > n_max_pipes_per_subport ||
            !rte_is_power_of_2(params->n_pipes_per_subport_enabled)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for number of pipes\n", __func__);
                return -EINVAL;
        }

        /* pipe_profiles and n_pipe_profiles */
        if (params->pipe_profiles == NULL ||
            params->n_pipe_profiles == 0 ||
                params->n_max_pipe_profiles == 0 ||
                params->n_pipe_profiles > params->n_max_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for pipe profiles\n", __func__);
                return -EINVAL;
        }

        for (i = 0; i < params->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *p = params->pipe_profiles + i;
                int status;

                status = pipe_profile_check(p, rate, &params->qsize[0]);
                if (status != 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Pipe profile check failed (%d)\n", __func__, status);
                        return -EINVAL;
                }
        }

        return 0;
}

uint32_t
rte_sched_port_get_memory_footprint(struct rte_sched_port_params *port_params,
        struct rte_sched_subport_params **subport_params)
{
        uint32_t size0 = 0, size1 = 0, i;
        int status;

        status = rte_sched_port_check_params(port_params);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Port scheduler port params check failed (%d)\n",
                        __func__, status);

                return 0;
        }

        for (i = 0; i < port_params->n_subports_per_port; i++) {
                struct rte_sched_subport_params *sp = subport_params[i];

                status = rte_sched_subport_check_params(sp,
                                port_params->n_pipes_per_subport,
                                port_params->rate);
                if (status != 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Port scheduler subport params check failed (%d)\n",
                                __func__, status);

                        return 0;
                }
        }

        size0 = sizeof(struct rte_sched_port);

        for (i = 0; i < port_params->n_subports_per_port; i++) {
                struct rte_sched_subport_params *sp = subport_params[i];

                size1 += rte_sched_subport_get_array_base(sp,
                                        e_RTE_SCHED_SUBPORT_ARRAY_TOTAL);
        }

        return size0 + size1;
}
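
/*
 * Usage sketch (error handling omitted): size the scheduler before
 * configuring it.
 *
 *	uint32_t size = rte_sched_port_get_memory_footprint(&port_params,
 *		subport_params);
 *	if (size == 0)
 *		rte_panic("Invalid scheduler parameters\n");
 */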

struct rte_sched_port *
rte_sched_port_config(struct rte_sched_port_params *params)
{
        struct rte_sched_port *port = NULL;
        uint32_t size0, size1;
        uint32_t cycles_per_byte;
        uint32_t i, j;
        int status;

        status = rte_sched_port_check_params(params);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Port scheduler params check failed (%d)\n",
                        __func__, status);
                return NULL;
        }

        size0 = sizeof(struct rte_sched_port);
        size1 = params->n_subports_per_port * sizeof(struct rte_sched_subport *);

        /* Allocate memory to store the data structures */
        port = rte_zmalloc_socket("qos_params", size0 + size1, RTE_CACHE_LINE_SIZE,
                params->socket);
        if (port == NULL) {
                RTE_LOG(ERR, SCHED, "%s: Memory allocation fails\n", __func__);

                return NULL;
        }

        /* User parameters */
        port->n_subports_per_port = params->n_subports_per_port;
        port->n_pipes_per_subport = params->n_pipes_per_subport;
        port->n_pipes_per_subport_log2 =
                        __builtin_ctz(params->n_pipes_per_subport);
        port->socket = params->socket;

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                port->pipe_queue[i] = i;

        for (i = 0, j = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) {
                port->pipe_tc[i] = j;

                if (j < RTE_SCHED_TRAFFIC_CLASS_BE)
                        j++;
        }

        for (i = 0, j = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) {
                port->tc_queue[i] = j;

                if (i >= RTE_SCHED_TRAFFIC_CLASS_BE)
                        j++;
        }
        port->rate = params->rate;
        port->mtu = params->mtu + params->frame_overhead;
        port->frame_overhead = params->frame_overhead;

        /* Timing */
        port->time_cpu_cycles = rte_get_tsc_cycles();
        port->time_cpu_bytes = 0;
        port->time = 0;

        cycles_per_byte = (rte_get_tsc_hz() << RTE_SCHED_TIME_SHIFT)
                / params->rate;
        port->inv_cycles_per_byte = rte_reciprocal_value(cycles_per_byte);

        /* Grinders */
        port->pkts_out = NULL;
        port->n_pkts_out = 0;

        return port;
}
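
/*
 * Configuration order sketch (assuming valid parameter tables): create
 * the port first, then each subport, then the pipes inside each subport.
 *
 *	struct rte_sched_port *p = rte_sched_port_config(&port_params);
 *	rte_sched_subport_config(p, 0, subport_params[0]);
 *	rte_sched_pipe_config(p, 0, 0, 0);
 */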

static inline void
rte_sched_subport_free(struct rte_sched_port *port,
        struct rte_sched_subport *subport)
{
        uint32_t n_subport_pipe_queues;
        uint32_t qindex;

        if (subport == NULL)
                return;

        n_subport_pipe_queues = rte_sched_subport_pipe_queues(subport);

        /* Free enqueued mbufs */
        for (qindex = 0; qindex < n_subport_pipe_queues; qindex++) {
                struct rte_mbuf **mbufs =
                        rte_sched_subport_pipe_qbase(subport, qindex);
                uint16_t qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
                if (qsize != 0) {
                        struct rte_sched_queue *queue = subport->queue + qindex;
                        uint16_t qr = queue->qr & (qsize - 1);
                        uint16_t qw = queue->qw & (qsize - 1);

                        for (; qr != qw; qr = (qr + 1) & (qsize - 1))
                                rte_pktmbuf_free(mbufs[qr]);
                }
        }

        rte_bitmap_free(subport->bmp);
}

void
rte_sched_port_free(struct rte_sched_port *port)
{
        uint32_t i;

        /* Check user parameters */
        if (port == NULL)
                return;

        for (i = 0; i < port->n_subports_per_port; i++)
                rte_sched_subport_free(port, port->subports[i]);

        rte_free(port);
}

static void
rte_sched_port_log_subport_config(struct rte_sched_port *port, uint32_t i)
{
        struct rte_sched_subport *s = port->subports[i];

        RTE_LOG(DEBUG, SCHED, "Low level config for subport %u:\n"
                "       Token bucket: period = %u, credits per period = %u, size = %u\n"
                "       Traffic classes: period = %u\n"
                "       credits per period = [%u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u]\n"
                "       Best effort traffic class oversubscription: wm min = %u, wm max = %u\n",
                i,

                /* Token bucket */
                s->tb_period,
                s->tb_credits_per_period,
                s->tb_size,

                /* Traffic classes */
                s->tc_period,
                s->tc_credits_per_period[0],
                s->tc_credits_per_period[1],
                s->tc_credits_per_period[2],
                s->tc_credits_per_period[3],
                s->tc_credits_per_period[4],
                s->tc_credits_per_period[5],
                s->tc_credits_per_period[6],
                s->tc_credits_per_period[7],
                s->tc_credits_per_period[8],
                s->tc_credits_per_period[9],
                s->tc_credits_per_period[10],
                s->tc_credits_per_period[11],
                s->tc_credits_per_period[12],

                /* Best effort traffic class oversubscription */
                s->tc_ov_wm_min,
                s->tc_ov_wm_max);
}

static void
rte_sched_free_memory(struct rte_sched_port *port, uint32_t n_subports)
{
        uint32_t i;

        for (i = 0; i < n_subports; i++) {
                struct rte_sched_subport *subport = port->subports[i];

                rte_sched_subport_free(port, subport);
        }

        rte_free(port);
}

int
rte_sched_subport_config(struct rte_sched_port *port,
        uint32_t subport_id,
        struct rte_sched_subport_params *params)
{
        struct rte_sched_subport *s = NULL;
        uint32_t n_subports = subport_id;
        uint32_t n_subport_pipe_queues, i;
        uint32_t size0, size1, bmp_mem_size;
        int status;

        /* Check user parameters */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        status = rte_sched_subport_check_params(params,
                port->n_pipes_per_subport,
                port->rate);
        if (status != 0) {
                RTE_LOG(NOTICE, SCHED,
                        "%s: Subport scheduler params check failed (%d)\n",
                        __func__, status);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        /* Determine the amount of memory to allocate */
        size0 = sizeof(struct rte_sched_subport);
        size1 = rte_sched_subport_get_array_base(params,
                                e_RTE_SCHED_SUBPORT_ARRAY_TOTAL);

        /* Allocate memory to store the data structures */
        s = rte_zmalloc_socket("subport_params", size0 + size1,
                RTE_CACHE_LINE_SIZE, port->socket);
        if (s == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Memory allocation fails\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -ENOMEM;
        }

        n_subports++;

        /* Port */
        port->subports[subport_id] = s;

        /* Token Bucket (TB) */
        if (params->tb_rate == port->rate) {
                s->tb_credits_per_period = 1;
                s->tb_period = 1;
        } else {
                double tb_rate = ((double) params->tb_rate) / ((double) port->rate);
                double d = RTE_SCHED_TB_RATE_CONFIG_ERR;

                rte_approx(tb_rate, d, &s->tb_credits_per_period, &s->tb_period);
        }

        s->tb_size = params->tb_size;
        s->tb_time = port->time;
        s->tb_credits = s->tb_size / 2;

        /* Traffic Classes (TCs) */
        s->tc_period = rte_sched_time_ms_to_bytes(params->tc_period, port->rate);
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if (params->qsize[i])
                        s->tc_credits_per_period[i]
                                = rte_sched_time_ms_to_bytes(params->tc_period,
                                        params->tc_rate[i]);
        }
        s->tc_time = port->time + s->tc_period;
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (params->qsize[i])
                        s->tc_credits[i] = s->tc_credits_per_period[i];

        /* compile time checks */
        RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS == 0);
        RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS &
                (RTE_SCHED_PORT_N_GRINDERS - 1));

        /* User parameters */
        s->n_pipes_per_subport_enabled = params->n_pipes_per_subport_enabled;
        memcpy(s->qsize, params->qsize, sizeof(params->qsize));
        s->n_pipe_profiles = params->n_pipe_profiles;
        s->n_max_pipe_profiles = params->n_max_pipe_profiles;

#ifdef RTE_SCHED_RED
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint32_t j;

                for (j = 0; j < RTE_COLORS; j++) {
                        /* if min/max are both zero, then RED is disabled */
                        if ((params->red_params[i][j].min_th |
                             params->red_params[i][j].max_th) == 0) {
                                continue;
                        }

                        if (rte_red_config_init(&s->red_config[i][j],
                                params->red_params[i][j].wq_log2,
                                params->red_params[i][j].min_th,
                                params->red_params[i][j].max_th,
                                params->red_params[i][j].maxp_inv) != 0) {
                                rte_sched_free_memory(port, n_subports);

                                RTE_LOG(NOTICE, SCHED,
                                "%s: RED configuration init fails\n", __func__);
                                return -EINVAL;
                        }
                }
        }
#endif

        /* Scheduling loop detection */
        s->pipe_loop = RTE_SCHED_PIPE_INVALID;
        s->pipe_exhaustion = 0;

        /* Grinders */
        s->busy_grinders = 0;

        /* Queue base calculation */
        rte_sched_subport_config_qsize(s);

        /* Large data structures */
        s->pipe = (struct rte_sched_pipe *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_PIPE));
        s->queue = (struct rte_sched_queue *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE));
        s->queue_extra = (struct rte_sched_queue_extra *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA));
        s->pipe_profiles = (struct rte_sched_pipe_profile *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES));
        s->bmp_array = s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY);
        s->queue_array = (struct rte_mbuf **)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY));

        /* Pipe profile table */
        rte_sched_subport_config_pipe_profile_table(s, params, port->rate);

        /* Bitmap */
        n_subport_pipe_queues = rte_sched_subport_pipe_queues(s);
        bmp_mem_size = rte_bitmap_get_memory_footprint(n_subport_pipe_queues);
        s->bmp = rte_bitmap_init(n_subport_pipe_queues, s->bmp_array,
                                bmp_mem_size);
        if (s->bmp == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Subport bitmap init error\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++)
                s->grinder_base_bmp_pos[i] = RTE_SCHED_PIPE_INVALID;

#ifdef RTE_SCHED_SUBPORT_TC_OV
        /* TC oversubscription */
        s->tc_ov_wm_min = port->mtu;
        s->tc_ov_wm_max = rte_sched_time_ms_to_bytes(params->tc_period,
                                                     s->pipe_tc_be_rate_max);
        s->tc_ov_wm = s->tc_ov_wm_max;
        s->tc_ov_period_id = 0;
        s->tc_ov = 0;
        s->tc_ov_n = 0;
        s->tc_ov_rate = 0;
#endif

        rte_sched_port_log_subport_config(port, subport_id);

        return 0;
}

int
rte_sched_pipe_config(struct rte_sched_port *port,
        uint32_t subport_id,
        uint32_t pipe_id,
        int32_t pipe_profile)
{
        struct rte_sched_subport *s;
        struct rte_sched_pipe *p;
        struct rte_sched_pipe_profile *params;
        uint32_t n_subports = subport_id + 1;
        uint32_t deactivate, profile, i;

        /* Check user parameters */
        profile = (uint32_t) pipe_profile;
        deactivate = (pipe_profile < 0);

        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter subport id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        s = port->subports[subport_id];
        if (pipe_id >= s->n_pipes_per_subport_enabled) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter pipe id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        if (!deactivate && profile >= s->n_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter pipe profile\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        /* Handle the case when pipe already has a valid configuration */
        p = s->pipe + pipe_id;
        if (p->tb_time) {
                params = s->pipe_profiles + p->profile;

                double subport_tc_be_rate =
                        (double) s->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) s->tc_period;
                double pipe_tc_be_rate =
                        (double) params->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) params->tc_period;
                uint32_t tc_be_ov = s->tc_ov;

                /* Unplug pipe from its subport */
                s->tc_ov_n -= params->tc_ov_weight;
                s->tc_ov_rate -= pipe_tc_be_rate;
                s->tc_ov = s->tc_ov_rate > subport_tc_be_rate;

                if (s->tc_ov != tc_be_ov) {
                        RTE_LOG(DEBUG, SCHED,
                                "Subport %u Best-effort TC oversubscription is OFF (%.4lf >= %.4lf)\n",
                                subport_id, subport_tc_be_rate, s->tc_ov_rate);
                }

                /* Reset the pipe */
                memset(p, 0, sizeof(struct rte_sched_pipe));
        }

        if (deactivate)
                return 0;

        /* Apply the new pipe configuration */
        p->profile = profile;
        params = s->pipe_profiles + p->profile;

        /* Token Bucket (TB) */
        p->tb_time = port->time;
        p->tb_credits = params->tb_size / 2;

        /* Traffic Classes (TCs) */
        p->tc_time = port->time + params->tc_period;

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (s->qsize[i])
                        p->tc_credits[i] = params->tc_credits_per_period[i];

        {
                /* Subport best effort tc oversubscription */
                double subport_tc_be_rate =
                        (double) s->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) s->tc_period;
                double pipe_tc_be_rate =
                        (double) params->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) params->tc_period;
                uint32_t tc_be_ov = s->tc_ov;

                s->tc_ov_n += params->tc_ov_weight;
                s->tc_ov_rate += pipe_tc_be_rate;
                s->tc_ov = s->tc_ov_rate > subport_tc_be_rate;

                if (s->tc_ov != tc_be_ov) {
                        RTE_LOG(DEBUG, SCHED,
                                "Subport %u Best effort TC oversubscription is ON (%.4lf < %.4lf)\n",
                                subport_id, subport_tc_be_rate, s->tc_ov_rate);
                }
                p->tc_ov_period_id = s->tc_ov_period_id;
                p->tc_ov_credits = s->tc_ov_wm;
        }

        return 0;
}

int
rte_sched_subport_pipe_profile_add(struct rte_sched_port *port,
        uint32_t subport_id,
        struct rte_sched_pipe_params *params,
        uint32_t *pipe_profile_id)
{
        struct rte_sched_subport *s;
        struct rte_sched_pipe_profile *pp;
        uint32_t i;
        int status;

        /* Port */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        /* Subport id must be less than the number of subports */
        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);
                return -EINVAL;
        }

        s = port->subports[subport_id];

        /* Number of pipe profiles must not exceed the max limit */
        if (s->n_pipe_profiles >= s->n_max_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Number of pipe profiles exceeds the max limit\n", __func__);
                return -EINVAL;
        }

        /* Pipe params */
        status = pipe_profile_check(params, port->rate, &s->qsize[0]);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Pipe profile check failed (%d)\n", __func__, status);
                return -EINVAL;
        }

        pp = &s->pipe_profiles[s->n_pipe_profiles];
        rte_sched_pipe_profile_convert(s, params, pp, port->rate);

        /* An identical pipe profile must not already exist */
        for (i = 0; i < s->n_pipe_profiles; i++)
                if (memcmp(s->pipe_profiles + i, pp, sizeof(*pp)) == 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Pipe profile exists\n", __func__);
                        return -EINVAL;
                }

        /* Pipe profile commit */
        *pipe_profile_id = s->n_pipe_profiles;
        s->n_pipe_profiles++;

        if (s->pipe_tc_be_rate_max < params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE])
                s->pipe_tc_be_rate_max = params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE];

        rte_sched_port_log_pipe_profile(s, *pipe_profile_id);

        return 0;
}

static inline uint32_t
rte_sched_port_qindex(struct rte_sched_port *port,
        uint32_t subport,
        uint32_t pipe,
        uint32_t traffic_class,
        uint32_t queue)
{
        return ((subport & (port->n_subports_per_port - 1)) <<
                (port->n_pipes_per_subport_log2 + 4)) |
                ((pipe &
                (port->subports[subport]->n_pipes_per_subport_enabled - 1)) << 4) |
                ((rte_sched_port_pipe_queue(port, traffic_class) + queue) &
                (RTE_SCHED_QUEUES_PER_PIPE - 1));
}
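
/*
 * Resulting queue index layout (16 queues per pipe): bits [0..3] select
 * the queue within the pipe, the next n_pipes_per_subport_log2 bits
 * select the pipe, and the top bits select the subport.
 */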

void
rte_sched_port_pkt_write(struct rte_sched_port *port,
                         struct rte_mbuf *pkt,
                         uint32_t subport, uint32_t pipe,
                         uint32_t traffic_class,
                         uint32_t queue, enum rte_color color)
{
        uint32_t queue_id =
                rte_sched_port_qindex(port, subport, pipe, traffic_class, queue);

        rte_mbuf_sched_set(pkt, queue_id, traffic_class, (uint8_t)color);
}
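
/*
 * Classification sketch (hypothetical values): each mbuf is stamped
 * before being handed to the scheduler enqueue path.
 *
 *	rte_sched_port_pkt_write(port, pkt, subport_id, pipe_id,
 *		tc, queue, RTE_COLOR_GREEN);
 */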

void
rte_sched_port_pkt_read_tree_path(struct rte_sched_port *port,
                                  const struct rte_mbuf *pkt,
                                  uint32_t *subport, uint32_t *pipe,
                                  uint32_t *traffic_class, uint32_t *queue)
{
        uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);

        *subport = queue_id >> (port->n_pipes_per_subport_log2 + 4);
        *pipe = (queue_id >> 4) &
                (port->subports[*subport]->n_pipes_per_subport_enabled - 1);
        *traffic_class = rte_sched_port_pipe_tc(port, queue_id);
        *queue = rte_sched_port_tc_queue(port, queue_id);
}

enum rte_color
rte_sched_port_pkt_read_color(const struct rte_mbuf *pkt)
{
        return (enum rte_color)rte_mbuf_sched_color_get(pkt);
}

int
rte_sched_subport_read_stats(struct rte_sched_port *port,
                             uint32_t subport_id,
                             struct rte_sched_subport_stats *stats,
                             uint32_t *tc_ov)
{
        struct rte_sched_subport *s;

        /* Check user parameters */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);
                return -EINVAL;
        }

        if (stats == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter stats\n", __func__);
                return -EINVAL;
        }

        if (tc_ov == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc_ov\n", __func__);
                return -EINVAL;
        }

        s = port->subports[subport_id];

        /* Copy subport stats and clear */
        memcpy(stats, &s->stats, sizeof(struct rte_sched_subport_stats));
        memset(&s->stats, 0, sizeof(struct rte_sched_subport_stats));

        /* Subport TC oversubscription status */
        *tc_ov = s->tc_ov;

        return 0;
}

int
rte_sched_queue_read_stats(struct rte_sched_port *port,
        uint32_t queue_id,
        struct rte_sched_queue_stats *stats,
        uint16_t *qlen)
{
        struct rte_sched_queue *q;
        struct rte_sched_queue_extra *qe;

        /* Check user parameters */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (queue_id >= rte_sched_port_queues_per_port(port)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for queue id\n", __func__);
                return -EINVAL;
        }

        if (stats == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter stats\n", __func__);
                return -EINVAL;
        }

        if (qlen == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter qlen\n", __func__);
                return -EINVAL;
        }
        q = port->queue + queue_id;
        qe = port->queue_extra + queue_id;

        /* Copy queue stats and clear */
        memcpy(stats, &qe->stats, sizeof(struct rte_sched_queue_stats));
        memset(&qe->stats, 0, sizeof(struct rte_sched_queue_stats));

        /* Queue length */
1500         *qlen = q->qw - q->qr;
1501
1502         return 0;
1503 }
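
/*
 * Note on the queue length computed above: qw and qr are free-running
 * 16-bit counters, so the unsigned difference qw - qr remains correct
 * across wrap-around as long as the occupancy never exceeds 65535
 * slots, e.g. qw = 3, qr = 65533 still gives qlen = 6.
 */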
1504
1505 #ifdef RTE_SCHED_DEBUG
1506
1507 static inline int
1508 rte_sched_port_queue_is_empty(struct rte_sched_subport *subport,
1509         uint32_t qindex)
1510 {
1511         struct rte_sched_queue *queue = subport->queue + qindex;
1512
1513         return queue->qr == queue->qw;
1514 }
1515
1516 #endif /* RTE_SCHED_DEBUG */
1517
1518 #ifdef RTE_SCHED_COLLECT_STATS
1519
1520 static inline void
1521 rte_sched_port_update_subport_stats(struct rte_sched_port *port,
1522         struct rte_sched_subport *subport,
1523         uint32_t qindex,
1524         struct rte_mbuf *pkt)
1525 {
1526         uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex);
1527         uint32_t pkt_len = pkt->pkt_len;
1528
1529         subport->stats.n_pkts_tc[tc_index] += 1;
1530         subport->stats.n_bytes_tc[tc_index] += pkt_len;
1531 }
1532
1533 #ifdef RTE_SCHED_RED
1534 static inline void
1535 rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port,
1536         struct rte_sched_subport *subport,
1537         uint32_t qindex,
1538         struct rte_mbuf *pkt,
1539         uint32_t red)
1540 #else
1541 static inline void
1542 rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port,
1543         struct rte_sched_subport *subport,
1544         uint32_t qindex,
1545         struct rte_mbuf *pkt,
1546         __rte_unused uint32_t red)
1547 #endif
1548 {
1549         uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex);
1550         uint32_t pkt_len = pkt->pkt_len;
1551
1552         subport->stats.n_pkts_tc_dropped[tc_index] += 1;
1553         subport->stats.n_bytes_tc_dropped[tc_index] += pkt_len;
1554 #ifdef RTE_SCHED_RED
1555         subport->stats.n_pkts_red_dropped[tc_index] += red;
1556 #endif
1557 }
1558
1559 static inline void
1560 rte_sched_port_update_queue_stats(struct rte_sched_subport *subport,
1561         uint32_t qindex,
1562         struct rte_mbuf *pkt)
1563 {
1564         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1565         uint32_t pkt_len = pkt->pkt_len;
1566
1567         qe->stats.n_pkts += 1;
1568         qe->stats.n_bytes += pkt_len;
1569 }
1570
1571 #ifdef RTE_SCHED_RED
1572 static inline void
1573 rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport,
1574         uint32_t qindex,
1575         struct rte_mbuf *pkt,
1576         uint32_t red)
1577 #else
1578 static inline void
1579 rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport,
1580         uint32_t qindex,
1581         struct rte_mbuf *pkt,
1582         __rte_unused uint32_t red)
1583 #endif
1584 {
1585         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1586         uint32_t pkt_len = pkt->pkt_len;
1587
1588         qe->stats.n_pkts_dropped += 1;
1589         qe->stats.n_bytes_dropped += pkt_len;
1590 #ifdef RTE_SCHED_RED
1591         qe->stats.n_pkts_red_dropped += red;
1592 #endif
1593 }
1594
1595 #endif /* RTE_SCHED_COLLECT_STATS */
1596
1597 #ifdef RTE_SCHED_RED
1598
1599 static inline int
1600 rte_sched_port_red_drop(struct rte_sched_port *port,
1601         struct rte_sched_subport *subport,
1602         struct rte_mbuf *pkt,
1603         uint32_t qindex,
1604         uint16_t qlen)
1605 {
1606         struct rte_sched_queue_extra *qe;
1607         struct rte_red_config *red_cfg;
1608         struct rte_red *red;
1609         uint32_t tc_index;
1610         enum rte_color color;
1611
1612         tc_index = rte_sched_port_pipe_tc(port, qindex);
1613         color = rte_sched_port_pkt_read_color(pkt);
1614         red_cfg = &subport->red_config[tc_index][color];
1615
1616         if ((red_cfg->min_th | red_cfg->max_th) == 0)
1617                 return 0;
1618
1619         qe = subport->queue_extra + qindex;
1620         red = &qe->red;
1621
1622         return rte_red_enqueue(red_cfg, red, qlen, port->time);
1623 }
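
/*
 * A (traffic class, color) pair configured with min_th == max_th == 0
 * is treated as "RED disabled" by the check above: such packets are
 * then only subject to the tail-drop test in the enqueue path.
 */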
1624
1625 static inline void
1626 rte_sched_port_set_queue_empty_timestamp(struct rte_sched_port *port,
1627         struct rte_sched_subport *subport, uint32_t qindex)
1628 {
1629         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1630         struct rte_red *red = &qe->red;
1631
1632         rte_red_mark_queue_empty(red, port->time);
1633 }
1634
1635 #else
1636
1637 static inline int rte_sched_port_red_drop(struct rte_sched_port *port __rte_unused,
1638         struct rte_sched_subport *subport __rte_unused,
1639         struct rte_mbuf *pkt __rte_unused,
1640         uint32_t qindex __rte_unused,
1641         uint16_t qlen __rte_unused)
1642 {
1643         return 0;
1644 }
1645
1646 #define rte_sched_port_set_queue_empty_timestamp(port, subport, qindex)
1647
1648 #endif /* RTE_SCHED_RED */
1649
1650 #ifdef RTE_SCHED_DEBUG
1651
1652 static inline void
1653 debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
1654                        uint64_t bmp_slab)
1655 {
1656         uint64_t mask;
1657         uint32_t i, panic;
1658
1659         if (bmp_slab == 0)
1660                 rte_panic("Empty slab at position %u\n", bmp_pos);
1661
1662         panic = 0;
1663         for (i = 0, mask = 1; i < 64; i++, mask <<= 1) {
1664                 if (mask & bmp_slab) {
1665                         if (rte_sched_port_queue_is_empty(subport, bmp_pos + i)) {
1666                                 printf("Queue %u (slab offset %u) is empty\n", bmp_pos + i, i);
1667                                 panic = 1;
1668                         }
1669                 }
1670         }
1671
1672         if (panic)
1673                 rte_panic("Empty queues in slab 0x%" PRIx64 " starting at position %u\n",
1674                         bmp_slab, bmp_pos);
1675 }
1676
1677 #endif /* RTE_SCHED_DEBUG */
1678
1679 static inline struct rte_sched_subport *
1680 rte_sched_port_subport(struct rte_sched_port *port,
1681         struct rte_mbuf *pkt)
1682 {
1683         uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
1684         uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
1685
1686         return port->subports[subport_id];
1687 }
1688
1689 static inline uint32_t
1690 rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
1691         struct rte_mbuf *pkt, uint32_t subport_qmask)
1692 {
1693         struct rte_sched_queue *q;
1694 #ifdef RTE_SCHED_COLLECT_STATS
1695         struct rte_sched_queue_extra *qe;
1696 #endif
1697         uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
1698         uint32_t subport_queue_id = subport_qmask & qindex;
1699
1700         q = subport->queue + subport_queue_id;
1701         rte_prefetch0(q);
1702 #ifdef RTE_SCHED_COLLECT_STATS
1703         qe = subport->queue_extra + subport_queue_id;
1704         rte_prefetch0(qe);
1705 #endif
1706
1707         return subport_queue_id;
1708 }
1709
1710 static inline void
1711 rte_sched_port_enqueue_qwa_prefetch0(struct rte_sched_port *port,
1712         struct rte_sched_subport *subport,
1713         uint32_t qindex,
1714         struct rte_mbuf **qbase)
1715 {
1716         struct rte_sched_queue *q;
1717         struct rte_mbuf **q_qw;
1718         uint16_t qsize;
1719
1720         q = subport->queue + qindex;
1721         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
1722         q_qw = qbase + (q->qw & (qsize - 1));
1723
1724         rte_prefetch0(q_qw);
1725         rte_bitmap_prefetch0(subport->bmp, qindex);
1726 }
1727
1728 static inline int
1729 rte_sched_port_enqueue_qwa(struct rte_sched_port *port,
1730         struct rte_sched_subport *subport,
1731         uint32_t qindex,
1732         struct rte_mbuf **qbase,
1733         struct rte_mbuf *pkt)
1734 {
1735         struct rte_sched_queue *q;
1736         uint16_t qsize;
1737         uint16_t qlen;
1738
1739         q = subport->queue + qindex;
1740         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
1741         qlen = q->qw - q->qr;
1742
1743         /* Drop the packet (and update drop stats) when RED drops it or the queue is full */
1744         if (unlikely(rte_sched_port_red_drop(port, subport, pkt, qindex, qlen) ||
1745                      (qlen >= qsize))) {
1746                 rte_pktmbuf_free(pkt);
1747 #ifdef RTE_SCHED_COLLECT_STATS
1748                 rte_sched_port_update_subport_stats_on_drop(port, subport,
1749                         qindex, pkt, qlen < qsize);
1750                 rte_sched_port_update_queue_stats_on_drop(subport, qindex, pkt,
1751                         qlen < qsize);
1752 #endif
1753                 return 0;
1754         }
1755
1756         /* Enqueue packet */
1757         qbase[q->qw & (qsize - 1)] = pkt;
1758         q->qw++;
1759
1760         /* Activate queue in the subport bitmap */
1761         rte_bitmap_set(subport->bmp, qindex);
1762
1763         /* Statistics */
1764 #ifdef RTE_SCHED_COLLECT_STATS
1765         rte_sched_port_update_subport_stats(port, subport, qindex, pkt);
1766         rte_sched_port_update_queue_stats(subport, qindex, pkt);
1767 #endif
1768
1769         return 1;
1770 }
1771
1772
1773 /*
1774  * The enqueue function implements a 4-level pipeline with each stage
1775  * processing two different packets. The purpose of using a pipeline
1776  * is to hide the latency of prefetching the data structures. The
1777  * naming convention is presented in the diagram below:
1778  *
1779  *   p00  _______   p10  _______   p20  _______   p30  _______
1780  * ----->|       |----->|       |----->|       |----->|       |----->
1781  *       |   0   |      |   1   |      |   2   |      |   3   |
1782  * ----->|_______|----->|_______|----->|_______|----->|_______|----->
1783  *   p01            p11            p21            p31
1784  *
1785  */
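
/*
 * Stage mapping, as annotated in the loop below: stage 0 reads the
 * packets in and prefetches their mbufs, stage 1 prefetches the
 * subport and the queue structures storing the queue pointers, stage 2
 * prefetches the queue write location, and stage 3 writes the packet
 * to its queue and activates the queue in the bitmap. pXY denotes the
 * packet currently in slot Y (0 or 1) of stage X.
 */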
1786 int
1787 rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
1788                        uint32_t n_pkts)
1789 {
1790         struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
1791                 *pkt30, *pkt31, *pkt_last;
1792         struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
1793                 **q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
1794         struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
1795                 *subport20, *subport21, *subport30, *subport31, *subport_last;
1796         uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
1797         uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
1798         uint32_t subport_qmask;
1799         uint32_t result, i;
1800
1801         result = 0;
1802         subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
1803
1804         /*
1805          * Less than 6 input packets available, which is not enough to
1806          * feed the pipeline
1807          */
1808         if (unlikely(n_pkts < 6)) {
1809                 struct rte_sched_subport *subports[5];
1810                 struct rte_mbuf **q_base[5];
1811                 uint32_t q[5];
1812
1813                 /* Prefetch the mbuf structure of each packet */
1814                 for (i = 0; i < n_pkts; i++)
1815                         rte_prefetch0(pkts[i]);
1816
1817                 /* Prefetch the subport structure for each packet */
1818                 for (i = 0; i < n_pkts; i++)
1819                         subports[i] = rte_sched_port_subport(port, pkts[i]);
1820
1821                 /* Prefetch the queue structure for each queue */
1822                 for (i = 0; i < n_pkts; i++)
1823                         q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
1824                                         pkts[i], subport_qmask);
1825
1826                 /* Prefetch the write pointer location of each queue */
1827                 for (i = 0; i < n_pkts; i++) {
1828                         q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
1829                         rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
1830                                 q[i], q_base[i]);
1831                 }
1832
1833                 /* Write each packet to its queue */
1834                 for (i = 0; i < n_pkts; i++)
1835                         result += rte_sched_port_enqueue_qwa(port, subports[i],
1836                                                 q[i], q_base[i], pkts[i]);
1837
1838                 return result;
1839         }
1840
1841         /* Feed the first 3 stages of the pipeline (6 packets needed) */
1842         pkt20 = pkts[0];
1843         pkt21 = pkts[1];
1844         rte_prefetch0(pkt20);
1845         rte_prefetch0(pkt21);
1846
1847         pkt10 = pkts[2];
1848         pkt11 = pkts[3];
1849         rte_prefetch0(pkt10);
1850         rte_prefetch0(pkt11);
1851
1852         subport20 = rte_sched_port_subport(port, pkt20);
1853         subport21 = rte_sched_port_subport(port, pkt21);
1854         q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
1855                         pkt20, subport_qmask);
1856         q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
1857                         pkt21, subport_qmask);
1858
1859         pkt00 = pkts[4];
1860         pkt01 = pkts[5];
1861         rte_prefetch0(pkt00);
1862         rte_prefetch0(pkt01);
1863
1864         subport10 = rte_sched_port_subport(port, pkt10);
1865         subport11 = rte_sched_port_subport(port, pkt11);
1866         q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
1867                         pkt10, subport_qmask);
1868         q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
1869                         pkt11, subport_qmask);
1870
1871         q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
1872         q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
1873         rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
1874         rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
1875
1876         /* Run the pipeline */
1877         for (i = 6; i < (n_pkts & (~1)); i += 2) {
1878                 /* Propagate stage inputs */
1879                 pkt30 = pkt20;
1880                 pkt31 = pkt21;
1881                 pkt20 = pkt10;
1882                 pkt21 = pkt11;
1883                 pkt10 = pkt00;
1884                 pkt11 = pkt01;
1885                 q30 = q20;
1886                 q31 = q21;
1887                 q20 = q10;
1888                 q21 = q11;
1889                 subport30 = subport20;
1890                 subport31 = subport21;
1891                 subport20 = subport10;
1892                 subport21 = subport11;
1893                 q30_base = q20_base;
1894                 q31_base = q21_base;
1895
1896                 /* Stage 0: Get packets in */
1897                 pkt00 = pkts[i];
1898                 pkt01 = pkts[i + 1];
1899                 rte_prefetch0(pkt00);
1900                 rte_prefetch0(pkt01);
1901
1902                 /* Stage 1: Prefetch subport and queue structures storing the queue pointers */
1903                 subport10 = rte_sched_port_subport(port, pkt10);
1904                 subport11 = rte_sched_port_subport(port, pkt11);
1905                 q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
1906                                 pkt10, subport_qmask);
1907                 q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
1908                                 pkt11, subport_qmask);
1909
1910                 /* Stage 2: Prefetch queue write location */
1911                 q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
1912                 q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
1913                 rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
1914                 rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
1915
1916                 /* Stage 3: Write packet to queue and activate queue */
1917                 r30 = rte_sched_port_enqueue_qwa(port, subport30,
1918                                 q30, q30_base, pkt30);
1919                 r31 = rte_sched_port_enqueue_qwa(port, subport31,
1920                                 q31, q31_base, pkt31);
1921                 result += r30 + r31;
1922         }
1923
1924         /*
1925          * Drain the pipeline (exactly 6 packets).
1926          * Handle the last packet in the case
1927          * of an odd number of input packets.
1928          */
1929         pkt_last = pkts[n_pkts - 1];
1930         rte_prefetch0(pkt_last);
1931
1932         subport00 = rte_sched_port_subport(port, pkt00);
1933         subport01 = rte_sched_port_subport(port, pkt01);
1934         q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
1935                         pkt00, subport_qmask);
1936         q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
1937                         pkt01, subport_qmask);
1938
1939         q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
1940         q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
1941         rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
1942         rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
1943
1944         r20 = rte_sched_port_enqueue_qwa(port, subport20,
1945                         q20, q20_base, pkt20);
1946         r21 = rte_sched_port_enqueue_qwa(port, subport21,
1947                         q21, q21_base, pkt21);
1948         result += r20 + r21;
1949
1950         subport_last = rte_sched_port_subport(port, pkt_last);
1951         q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
1952                                 pkt_last, subport_qmask);
1953
1954         q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
1955         q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
1956         rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
1957         rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
1958
1959         r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
1960                         q10_base, pkt10);
1961         r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
1962                         q11_base, pkt11);
1963         result += r10 + r11;
1964
1965         q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
1966         rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
1967                 q_last, q_last_base);
1968
1969         r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
1970                         q00_base, pkt00);
1971         r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
1972                         q01_base, pkt01);
1973         result += r00 + r01;
1974
1975         if (n_pkts & 1) {
1976                 r_last = rte_sched_port_enqueue_qwa(port, subport_last,
1977                                         q_last, q_last_base, pkt_last);
1978                 result += r_last;
1979         }
1980
1981         return result;
1982 }
1983
1984 #ifndef RTE_SCHED_SUBPORT_TC_OV
1985
1986 static inline void
1987 grinder_credits_update(struct rte_sched_port *port,
1988         struct rte_sched_subport *subport, uint32_t pos)
1989 {
1990         struct rte_sched_grinder *grinder = subport->grinder + pos;
1991         struct rte_sched_pipe *pipe = grinder->pipe;
1992         struct rte_sched_pipe_profile *params = grinder->pipe_params;
1993         uint64_t n_periods;
1994         uint32_t i;
1995
1996         /* Subport TB */
1997         n_periods = (port->time - subport->tb_time) / subport->tb_period;
1998         subport->tb_credits += n_periods * subport->tb_credits_per_period;
1999         subport->tb_credits = rte_sched_min_val_2_u32(subport->tb_credits, subport->tb_size);
2000         subport->tb_time += n_periods * subport->tb_period;
2001
2002         /* Pipe TB */
2003         n_periods = (port->time - pipe->tb_time) / params->tb_period;
2004         pipe->tb_credits += n_periods * params->tb_credits_per_period;
2005         pipe->tb_credits = rte_sched_min_val_2_u32(pipe->tb_credits, params->tb_size);
2006         pipe->tb_time += n_periods * params->tb_period;
2007
2008         /* Subport TCs */
2009         if (unlikely(port->time >= subport->tc_time)) {
2010                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2011                         subport->tc_credits[i] = subport->tc_credits_per_period[i];
2012
2013                 subport->tc_time = port->time + subport->tc_period;
2014         }
2015
2016         /* Pipe TCs */
2017         if (unlikely(port->time >= pipe->tc_time)) {
2018                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2019                         pipe->tc_credits[i] = params->tc_credits_per_period[i];
2020
2021                 pipe->tc_time = port->time + params->tc_period;
2022         }
2023 }
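
/*
 * Worked example for the token bucket refill above (hypothetical
 * numbers): with tb_period = 10, tb_credits_per_period = 100,
 * tb_size = 1000, tb_credits = 950 and port->time - tb_time = 35,
 * n_periods = 3, so 300 credits are added and clipped at tb_size =
 * 1000, while tb_time advances by only 30: the 5 remaining time units
 * carry over to the next update.
 */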
2024
2025 #else
2026
2027 static inline uint32_t
2028 grinder_tc_ov_credits_update(struct rte_sched_port *port,
2029         struct rte_sched_subport *subport)
2030 {
2031         uint32_t tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
2032         uint32_t tc_consumption = 0, tc_ov_consumption_max;
2033         uint32_t tc_ov_wm = subport->tc_ov_wm;
2034         uint32_t i;
2035
2036         if (subport->tc_ov == 0)
2037                 return subport->tc_ov_wm_max;
2038
2039         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASS_BE; i++) {
2040                 tc_ov_consumption[i] =
2041                         subport->tc_credits_per_period[i] - subport->tc_credits[i];
2042                 tc_consumption += tc_ov_consumption[i];
2043         }
2044
2045         tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASS_BE] =
2046                 subport->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE] -
2047                 subport->tc_credits[RTE_SCHED_TRAFFIC_CLASS_BE];
2048
2049         tc_ov_consumption_max =
2050                 subport->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE] -
2051                         tc_consumption;
2052
2053         if (tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASS_BE] >
2054                 (tc_ov_consumption_max - port->mtu)) {
2055                 tc_ov_wm  -= tc_ov_wm >> 7;
2056                 if (tc_ov_wm < subport->tc_ov_wm_min)
2057                         tc_ov_wm = subport->tc_ov_wm_min;
2058
2059                 return tc_ov_wm;
2060         }
2061
2062         tc_ov_wm += (tc_ov_wm >> 7) + 1;
2063         if (tc_ov_wm > subport->tc_ov_wm_max)
2064                 tc_ov_wm = subport->tc_ov_wm_max;
2065
2066         return tc_ov_wm;
2067 }
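
/*
 * The watermark logic above adapts tc_ov_wm by roughly 1/128 of its
 * value per subport tc_period: it shrinks when the best-effort TC
 * consumed (almost) all of the bandwidth left unused by the
 * higher-priority TCs, and grows otherwise, clamped to the
 * [tc_ov_wm_min, tc_ov_wm_max] range. E.g. tc_ov_wm = 12800 shrinks
 * to 12700 (12800 >> 7 = 100) or grows to 12901.
 */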
2068
2069 static inline void
2070 grinder_credits_update(struct rte_sched_port *port,
2071         struct rte_sched_subport *subport, uint32_t pos)
2072 {
2073         struct rte_sched_grinder *grinder = subport->grinder + pos;
2074         struct rte_sched_pipe *pipe = grinder->pipe;
2075         struct rte_sched_pipe_profile *params = grinder->pipe_params;
2076         uint64_t n_periods;
2077         uint32_t i;
2078
2079         /* Subport TB */
2080         n_periods = (port->time - subport->tb_time) / subport->tb_period;
2081         subport->tb_credits += n_periods * subport->tb_credits_per_period;
2082         subport->tb_credits = rte_sched_min_val_2_u32(subport->tb_credits, subport->tb_size);
2083         subport->tb_time += n_periods * subport->tb_period;
2084
2085         /* Pipe TB */
2086         n_periods = (port->time - pipe->tb_time) / params->tb_period;
2087         pipe->tb_credits += n_periods * params->tb_credits_per_period;
2088         pipe->tb_credits = rte_sched_min_val_2_u32(pipe->tb_credits, params->tb_size);
2089         pipe->tb_time += n_periods * params->tb_period;
2090
2091         /* Subport TCs */
2092         if (unlikely(port->time >= subport->tc_time)) {
2093                 subport->tc_ov_wm = grinder_tc_ov_credits_update(port, subport);
2094
2095                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2096                         subport->tc_credits[i] = subport->tc_credits_per_period[i];
2097
2098                 subport->tc_time = port->time + subport->tc_period;
2099                 subport->tc_ov_period_id++;
2100         }
2101
2102         /* Pipe TCs */
2103         if (unlikely(port->time >= pipe->tc_time)) {
2104                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2105                         pipe->tc_credits[i] = params->tc_credits_per_period[i];
2106                 pipe->tc_time = port->time + params->tc_period;
2107         }
2108
2109         /* Pipe TCs - Oversubscription */
2110         if (unlikely(pipe->tc_ov_period_id != subport->tc_ov_period_id)) {
2111                 pipe->tc_ov_credits = subport->tc_ov_wm * params->tc_ov_weight;
2112
2113                 pipe->tc_ov_period_id = subport->tc_ov_period_id;
2114         }
2115 }
2116
2117 #endif /* RTE_SCHED_SUBPORT_TC_OV */
2118
2119
2120 #ifndef RTE_SCHED_SUBPORT_TC_OV
2121
2122 static inline int
2123 grinder_credits_check(struct rte_sched_port *port,
2124         struct rte_sched_subport *subport, uint32_t pos)
2125 {
2126         struct rte_sched_grinder *grinder = subport->grinder + pos;
2127         struct rte_sched_pipe *pipe = grinder->pipe;
2128         struct rte_mbuf *pkt = grinder->pkt;
2129         uint32_t tc_index = grinder->tc_index;
2130         uint32_t pkt_len = pkt->pkt_len + port->frame_overhead;
2131         uint32_t subport_tb_credits = subport->tb_credits;
2132         uint32_t subport_tc_credits = subport->tc_credits[tc_index];
2133         uint32_t pipe_tb_credits = pipe->tb_credits;
2134         uint32_t pipe_tc_credits = pipe->tc_credits[tc_index];
2135         int enough_credits;
2136
2137         /* Check pipe and subport credits */
2138         enough_credits = (pkt_len <= subport_tb_credits) &&
2139                 (pkt_len <= subport_tc_credits) &&
2140                 (pkt_len <= pipe_tb_credits) &&
2141                 (pkt_len <= pipe_tc_credits);
2142
2143         if (!enough_credits)
2144                 return 0;
2145
2146         /* Update pipe and subport credits */
2147         subport->tb_credits -= pkt_len;
2148         subport->tc_credits[tc_index] -= pkt_len;
2149         pipe->tb_credits -= pkt_len;
2150         pipe->tc_credits[tc_index] -= pkt_len;
2151
2152         return 1;
2153 }
2154
2155 #else
2156
2157 static inline int
2158 grinder_credits_check(struct rte_sched_port *port,
2159         struct rte_sched_subport *subport, uint32_t pos)
2160 {
2161         struct rte_sched_grinder *grinder = subport->grinder + pos;
2162         struct rte_sched_pipe *pipe = grinder->pipe;
2163         struct rte_mbuf *pkt = grinder->pkt;
2164         uint32_t tc_index = grinder->tc_index;
2165         uint32_t pkt_len = pkt->pkt_len + port->frame_overhead;
2166         uint32_t subport_tb_credits = subport->tb_credits;
2167         uint32_t subport_tc_credits = subport->tc_credits[tc_index];
2168         uint32_t pipe_tb_credits = pipe->tb_credits;
2169         uint32_t pipe_tc_credits = pipe->tc_credits[tc_index];
2170         uint32_t pipe_tc_ov_mask1[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
2171         uint32_t pipe_tc_ov_mask2[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE] = {0};
2172         uint32_t pipe_tc_ov_credits, i;
2173         int enough_credits;
2174
2175         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2176                 pipe_tc_ov_mask1[i] = UINT32_MAX;
2177
2178         pipe_tc_ov_mask1[RTE_SCHED_TRAFFIC_CLASS_BE] = pipe->tc_ov_credits;
2179         pipe_tc_ov_mask2[RTE_SCHED_TRAFFIC_CLASS_BE] = UINT32_MAX;
2180         pipe_tc_ov_credits = pipe_tc_ov_mask1[tc_index];
2181
2182         /* Check pipe and subport credits */
2183         enough_credits = (pkt_len <= subport_tb_credits) &&
2184                 (pkt_len <= subport_tc_credits) &&
2185                 (pkt_len <= pipe_tb_credits) &&
2186                 (pkt_len <= pipe_tc_credits) &&
2187                 (pkt_len <= pipe_tc_ov_credits);
2188
2189         if (!enough_credits)
2190                 return 0;
2191
2192         /* Update pipe and subport credits */
2193         subport->tb_credits -= pkt_len;
2194         subport->tc_credits[tc_index] -= pkt_len;
2195         pipe->tb_credits -= pkt_len;
2196         pipe->tc_credits[tc_index] -= pkt_len;
2197         pipe->tc_ov_credits -= pipe_tc_ov_mask2[tc_index] & pkt_len;
2198
2199         return 1;
2200 }
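
/*
 * Illustrative walk-through of the branchless masking above: for
 * tc_index < RTE_SCHED_TRAFFIC_CLASS_BE, pipe_tc_ov_credits reads as
 * UINT32_MAX, so the oversubscription comparison can never fail, and
 * pipe_tc_ov_mask2[tc_index] = 0 zeroes the subtraction, leaving
 * pipe->tc_ov_credits untouched. For the best-effort TC the real
 * pipe->tc_ov_credits value is checked and the full pkt_len is
 * subtracted from it.
 */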
2201
2202 #endif /* RTE_SCHED_SUBPORT_TC_OV */
2203
2204
2205 static inline int
2206 grinder_schedule(struct rte_sched_port *port,
2207         struct rte_sched_subport *subport, uint32_t pos)
2208 {
2209         struct rte_sched_grinder *grinder = subport->grinder + pos;
2210         struct rte_sched_queue *queue = grinder->queue[grinder->qpos];
2211         struct rte_mbuf *pkt = grinder->pkt;
2212         uint32_t pkt_len = pkt->pkt_len + port->frame_overhead;
2213         uint32_t be_tc_active;
2214
2215         if (!grinder_credits_check(port, subport, pos))
2216                 return 0;
2217
2218         /* Advance port time */
2219         port->time += pkt_len;
2220
2221         /* Send packet */
2222         port->pkts_out[port->n_pkts_out++] = pkt;
2223         queue->qr++;
2224
2225         be_tc_active = (grinder->tc_index == RTE_SCHED_TRAFFIC_CLASS_BE) ? ~0x0 : 0x0;
2226         grinder->wrr_tokens[grinder->qpos] +=
2227                 (pkt_len * grinder->wrr_cost[grinder->qpos]) & be_tc_active;
2228
2229         if (queue->qr == queue->qw) {
2230                 uint32_t qindex = grinder->qindex[grinder->qpos];
2231
2232                 rte_bitmap_clear(subport->bmp, qindex);
2233                 grinder->qmask &= ~(1 << grinder->qpos);
2234                 if (be_tc_active)
2235                         grinder->wrr_mask[grinder->qpos] = 0;
2236                 rte_sched_port_set_queue_empty_timestamp(port, subport, qindex);
2237         }
2238
2239         /* Reset pipe loop detection */
2240         subport->pipe_loop = RTE_SCHED_PIPE_INVALID;
2241         grinder->productive = 1;
2242
2243         return 1;
2244 }
2245
2246 #ifdef SCHED_VECTOR_SSE4
2247
2248 static inline int
2249 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2250 {
2251         __m128i index = _mm_set1_epi32(base_pipe);
2252         __m128i pipes = _mm_load_si128((__m128i *)subport->grinder_base_bmp_pos);
2253         __m128i res = _mm_cmpeq_epi32(pipes, index);
2254
2255         pipes = _mm_load_si128((__m128i *)(subport->grinder_base_bmp_pos + 4));
2256         pipes = _mm_cmpeq_epi32(pipes, index);
2257         res = _mm_or_si128(res, pipes);
2258
2259         if (_mm_testz_si128(res, res))
2260                 return 0;
2261
2262         return 1;
2263 }
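
/*
 * The two 128-bit loads above cover all RTE_SCHED_PORT_N_GRINDERS = 8
 * base bitmap positions (4 lanes per load); _mm_testz_si128() yields 1
 * only when no lane compared equal to base_pipe.
 */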
2264
2265 #elif defined(SCHED_VECTOR_NEON)
2266
2267 static inline int
2268 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2269 {
2270         uint32x4_t index, pipes;
2271         uint32_t *pos = (uint32_t *)subport->grinder_base_bmp_pos;
2272
2273         index = vmovq_n_u32(base_pipe);
2274         pipes = vld1q_u32(pos);
2275         if (!vminvq_u32(veorq_u32(pipes, index)))
2276                 return 1;
2277
2278         pipes = vld1q_u32(pos + 4);
2279         if (!vminvq_u32(veorq_u32(pipes, index)))
2280                 return 1;
2281
2282         return 0;
2283 }
2284
2285 #else
2286
2287 static inline int
2288 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2289 {
2290         uint32_t i;
2291
2292         for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++) {
2293                 if (subport->grinder_base_bmp_pos[i] == base_pipe)
2294                         return 1;
2295         }
2296
2297         return 0;
2298 }
2299
2300 #endif /* SCHED_VECTOR_SSE4, SCHED_VECTOR_NEON */
2301
2302 static inline void
2303 grinder_pcache_populate(struct rte_sched_subport *subport,
2304         uint32_t pos, uint32_t bmp_pos, uint64_t bmp_slab)
2305 {
2306         struct rte_sched_grinder *grinder = subport->grinder + pos;
2307         uint16_t w[4];
2308
2309         grinder->pcache_w = 0;
2310         grinder->pcache_r = 0;
2311
2312         w[0] = (uint16_t) bmp_slab;
2313         w[1] = (uint16_t) (bmp_slab >> 16);
2314         w[2] = (uint16_t) (bmp_slab >> 32);
2315         w[3] = (uint16_t) (bmp_slab >> 48);
2316
2317         grinder->pcache_qmask[grinder->pcache_w] = w[0];
2318         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos;
2319         grinder->pcache_w += (w[0] != 0);
2320
2321         grinder->pcache_qmask[grinder->pcache_w] = w[1];
2322         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 16;
2323         grinder->pcache_w += (w[1] != 0);
2324
2325         grinder->pcache_qmask[grinder->pcache_w] = w[2];
2326         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 32;
2327         grinder->pcache_w += (w[2] != 0);
2328
2329         grinder->pcache_qmask[grinder->pcache_w] = w[3];
2330         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 48;
2331         grinder->pcache_w += (w[3] != 0);
2332 }
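
/*
 * Worked example (hypothetical slab): bmp_slab = 0x0000000100008001
 * splits into w[0] = 0x8001, w[1] = 0, w[2] = 0x0001, w[3] = 0. Only
 * the non-zero words advance pcache_w, so the pipe cache ends up with
 * two entries: qmask 0x8001 at qindex bmp_pos and qmask 0x0001 at
 * qindex bmp_pos + 32.
 */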
2333
2334 static inline void
2335 grinder_tccache_populate(struct rte_sched_subport *subport,
2336         uint32_t pos, uint32_t qindex, uint16_t qmask)
2337 {
2338         struct rte_sched_grinder *grinder = subport->grinder + pos;
2339         uint8_t b, i;
2340
2341         grinder->tccache_w = 0;
2342         grinder->tccache_r = 0;
2343
2344         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASS_BE; i++) {
2345                 b = (uint8_t) ((qmask >> i) & 0x1);
2346                 grinder->tccache_qmask[grinder->tccache_w] = b;
2347                 grinder->tccache_qindex[grinder->tccache_w] = qindex + i;
2348                 grinder->tccache_w += (b != 0);
2349         }
2350
2351         b = (uint8_t) (qmask >> (RTE_SCHED_TRAFFIC_CLASS_BE));
2352         grinder->tccache_qmask[grinder->tccache_w] = b;
2353         grinder->tccache_qindex[grinder->tccache_w] = qindex +
2354                 RTE_SCHED_TRAFFIC_CLASS_BE;
2355         grinder->tccache_w += (b != 0);
2356 }
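
/*
 * Illustrative example (assuming the 16 queues/pipe layout, with the
 * four best-effort queues in the top bits of qmask): qmask = 0x9001
 * (the queue of TC 0 plus best-effort queues 0 and 3 active) produces
 * two cache entries: qmask 0x1 at qindex and qmask 0x9 at
 * qindex + RTE_SCHED_TRAFFIC_CLASS_BE.
 */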
2357
2358 static inline int
2359 grinder_next_tc(struct rte_sched_port *port,
2360         struct rte_sched_subport *subport, uint32_t pos)
2361 {
2362         struct rte_sched_grinder *grinder = subport->grinder + pos;
2363         struct rte_mbuf **qbase;
2364         uint32_t qindex;
2365         uint16_t qsize;
2366
2367         if (grinder->tccache_r == grinder->tccache_w)
2368                 return 0;
2369
2370         qindex = grinder->tccache_qindex[grinder->tccache_r];
2371         qbase = rte_sched_subport_pipe_qbase(subport, qindex);
2372         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
2373
2374         grinder->tc_index = rte_sched_port_pipe_tc(port, qindex);
2375         grinder->qmask = grinder->tccache_qmask[grinder->tccache_r];
2376         grinder->qsize = qsize;
2377
2378         if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) {
2379                 grinder->queue[0] = subport->queue + qindex;
2380                 grinder->qbase[0] = qbase;
2381                 grinder->qindex[0] = qindex;
2382                 grinder->tccache_r++;
2383
2384                 return 1;
2385         }
2386
2387         grinder->queue[0] = subport->queue + qindex;
2388         grinder->queue[1] = subport->queue + qindex + 1;
2389         grinder->queue[2] = subport->queue + qindex + 2;
2390         grinder->queue[3] = subport->queue + qindex + 3;
2391
2392         grinder->qbase[0] = qbase;
2393         grinder->qbase[1] = qbase + qsize;
2394         grinder->qbase[2] = qbase + 2 * qsize;
2395         grinder->qbase[3] = qbase + 3 * qsize;
2396
2397         grinder->qindex[0] = qindex;
2398         grinder->qindex[1] = qindex + 1;
2399         grinder->qindex[2] = qindex + 2;
2400         grinder->qindex[3] = qindex + 3;
2401
2402         grinder->tccache_r++;
2403         return 1;
2404 }
2405
2406 static inline int
2407 grinder_next_pipe(struct rte_sched_port *port,
2408         struct rte_sched_subport *subport, uint32_t pos)
2409 {
2410         struct rte_sched_grinder *grinder = subport->grinder + pos;
2411         uint32_t pipe_qindex;
2412         uint16_t pipe_qmask;
2413
2414         if (grinder->pcache_r < grinder->pcache_w) {
2415                 pipe_qmask = grinder->pcache_qmask[grinder->pcache_r];
2416                 pipe_qindex = grinder->pcache_qindex[grinder->pcache_r];
2417                 grinder->pcache_r++;
2418         } else {
2419                 uint64_t bmp_slab = 0;
2420                 uint32_t bmp_pos = 0;
2421
2422                 /* Get another non-empty pipe group */
2423                 if (unlikely(rte_bitmap_scan(subport->bmp, &bmp_pos, &bmp_slab) <= 0))
2424                         return 0;
2425
2426 #ifdef RTE_SCHED_DEBUG
2427                 debug_check_queue_slab(subport, bmp_pos, bmp_slab);
2428 #endif
2429
2430                 /* Return if pipe group already in one of the other grinders */
2431                 subport->grinder_base_bmp_pos[pos] = RTE_SCHED_BMP_POS_INVALID;
2432                 if (unlikely(grinder_pipe_exists(subport, bmp_pos)))
2433                         return 0;
2434
2435                 subport->grinder_base_bmp_pos[pos] = bmp_pos;
2436
2437                 /* Install new pipe group into grinder's pipe cache */
2438                 grinder_pcache_populate(subport, pos, bmp_pos, bmp_slab);
2439
2440                 pipe_qmask = grinder->pcache_qmask[0];
2441                 pipe_qindex = grinder->pcache_qindex[0];
2442                 grinder->pcache_r = 1;
2443         }
2444
2445         /* Install new pipe in the grinder */
2446         grinder->pindex = pipe_qindex >> 4;
2447         grinder->subport = subport;
2448         grinder->pipe = subport->pipe + grinder->pindex;
2449         grinder->pipe_params = NULL; /* to be set after the pipe structure is prefetched */
2450         grinder->productive = 0;
2451
2452         grinder_tccache_populate(subport, pos, pipe_qindex, pipe_qmask);
2453         grinder_next_tc(port, subport, pos);
2454
2455         /* Check for pipe exhaustion */
2456         if (grinder->pindex == subport->pipe_loop) {
2457                 subport->pipe_exhaustion = 1;
2458                 subport->pipe_loop = RTE_SCHED_PIPE_INVALID;
2459         }
2460
2461         return 1;
2462 }
2463
2464
2465 static inline void
2466 grinder_wrr_load(struct rte_sched_subport *subport, uint32_t pos)
2467 {
2468         struct rte_sched_grinder *grinder = subport->grinder + pos;
2469         struct rte_sched_pipe *pipe = grinder->pipe;
2470         struct rte_sched_pipe_profile *pipe_params = grinder->pipe_params;
2471         uint32_t qmask = grinder->qmask;
2472
2473         grinder->wrr_tokens[0] =
2474                 ((uint16_t) pipe->wrr_tokens[0]) << RTE_SCHED_WRR_SHIFT;
2475         grinder->wrr_tokens[1] =
2476                 ((uint16_t) pipe->wrr_tokens[1]) << RTE_SCHED_WRR_SHIFT;
2477         grinder->wrr_tokens[2] =
2478                 ((uint16_t) pipe->wrr_tokens[2]) << RTE_SCHED_WRR_SHIFT;
2479         grinder->wrr_tokens[3] =
2480                 ((uint16_t) pipe->wrr_tokens[3]) << RTE_SCHED_WRR_SHIFT;
2481
2482         grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
2483         grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
2484         grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
2485         grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
2486
2487         grinder->wrr_cost[0] = pipe_params->wrr_cost[0];
2488         grinder->wrr_cost[1] = pipe_params->wrr_cost[1];
2489         grinder->wrr_cost[2] = pipe_params->wrr_cost[2];
2490         grinder->wrr_cost[3] = pipe_params->wrr_cost[3];
2491 }
2492
2493 static inline void
2494 grinder_wrr_store(struct rte_sched_subport *subport, uint32_t pos)
2495 {
2496         struct rte_sched_grinder *grinder = subport->grinder + pos;
2497         struct rte_sched_pipe *pipe = grinder->pipe;
2498
2499         pipe->wrr_tokens[0] =
2500                         (grinder->wrr_tokens[0] & grinder->wrr_mask[0]) >>
2501                                 RTE_SCHED_WRR_SHIFT;
2502         pipe->wrr_tokens[1] =
2503                         (grinder->wrr_tokens[1] & grinder->wrr_mask[1]) >>
2504                                 RTE_SCHED_WRR_SHIFT;
2505         pipe->wrr_tokens[2] =
2506                         (grinder->wrr_tokens[2] & grinder->wrr_mask[2]) >>
2507                                 RTE_SCHED_WRR_SHIFT;
2508         pipe->wrr_tokens[3] =
2509                         (grinder->wrr_tokens[3] & grinder->wrr_mask[3]) >>
2510                                 RTE_SCHED_WRR_SHIFT;
2511 }
2512
2513 static inline void
2514 grinder_wrr(struct rte_sched_subport *subport, uint32_t pos)
2515 {
2516         struct rte_sched_grinder *grinder = subport->grinder + pos;
2517         uint16_t wrr_tokens_min;
2518
2519         grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
2520         grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
2521         grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
2522         grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
2523
2524         grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
2525         wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
2526
2527         grinder->wrr_tokens[0] -= wrr_tokens_min;
2528         grinder->wrr_tokens[1] -= wrr_tokens_min;
2529         grinder->wrr_tokens[2] -= wrr_tokens_min;
2530         grinder->wrr_tokens[3] -= wrr_tokens_min;
2531 }
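
/*
 * Worked example (hypothetical token values): with wrr_mask =
 * {0xFFFF, 0xFFFF, 0, 0xFFFF} (queue 2 inactive) and wrr_tokens =
 * {24, 8, 5, 30}, the OR with ~wrr_mask turns slot 2 into 0xFFFF so it
 * can never be selected; rte_min_pos_4_u16() picks qpos = 1, and the
 * minimum (8) is subtracted from every slot, leaving
 * {16, 0, 0xFFF7, 22}.
 */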
2532
2533
2534 #define grinder_evict(subport, pos)
2535
2536 static inline void
2537 grinder_prefetch_pipe(struct rte_sched_subport *subport, uint32_t pos)
2538 {
2539         struct rte_sched_grinder *grinder = subport->grinder + pos;
2540
2541         rte_prefetch0(grinder->pipe);
2542         rte_prefetch0(grinder->queue[0]);
2543 }
2544
2545 static inline void
2546 grinder_prefetch_tc_queue_arrays(struct rte_sched_subport *subport, uint32_t pos)
2547 {
2548         struct rte_sched_grinder *grinder = subport->grinder + pos;
2549         uint16_t qsize, qr[RTE_SCHED_MAX_QUEUES_PER_TC];
2550
2551         qsize = grinder->qsize;
2552         grinder->qpos = 0;
2553
2554         if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) {
2555                 qr[0] = grinder->queue[0]->qr & (qsize - 1);
2556
2557                 rte_prefetch0(grinder->qbase[0] + qr[0]);
2558                 return;
2559         }
2560
2561         qr[0] = grinder->queue[0]->qr & (qsize - 1);
2562         qr[1] = grinder->queue[1]->qr & (qsize - 1);
2563         qr[2] = grinder->queue[2]->qr & (qsize - 1);
2564         qr[3] = grinder->queue[3]->qr & (qsize - 1);
2565
2566         rte_prefetch0(grinder->qbase[0] + qr[0]);
2567         rte_prefetch0(grinder->qbase[1] + qr[1]);
2568
2569         grinder_wrr_load(subport, pos);
2570         grinder_wrr(subport, pos);
2571
2572         rte_prefetch0(grinder->qbase[2] + qr[2]);
2573         rte_prefetch0(grinder->qbase[3] + qr[3]);
2574 }
2575
2576 static inline void
2577 grinder_prefetch_mbuf(struct rte_sched_subport *subport, uint32_t pos)
2578 {
2579         struct rte_sched_grinder *grinder = subport->grinder + pos;
2580         uint32_t qpos = grinder->qpos;
2581         struct rte_mbuf **qbase = grinder->qbase[qpos];
2582         uint16_t qsize = grinder->qsize;
2583         uint16_t qr = grinder->queue[qpos]->qr & (qsize - 1);
2584
2585         grinder->pkt = qbase[qr];
2586         rte_prefetch0(grinder->pkt);
2587
2588         if (unlikely((qr & 0x7) == 7)) {
2589                 uint16_t qr_next = (grinder->queue[qpos]->qr + 1) & (qsize - 1);
2590
2591                 rte_prefetch0(qbase + qr_next);
2592         }
2593 }
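
/*
 * Note on the (qr & 0x7) == 7 test above: eight 8-byte mbuf pointers
 * fill one 64-byte cache line, so when the current packet is the last
 * pointer of its line, the line holding the next packet pointer is
 * prefetched ahead of time.
 */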
2594
2595 static inline uint32_t
2596 grinder_handle(struct rte_sched_port *port, uint32_t pos)
2597 {
2598         struct rte_sched_subport *subport = port->subport;
2599         struct rte_sched_grinder *grinder = subport->grinder + pos;
2600
2601         switch (grinder->state) {
2602         case e_GRINDER_PREFETCH_PIPE:
2603         {
2604                 if (grinder_next_pipe(port, subport, pos)) {
2605                         grinder_prefetch_pipe(subport, pos);
2606                         subport->busy_grinders++;
2607
2608                         grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS;
2609                         return 0;
2610                 }
2611
2612                 return 0;
2613         }
2614
2615         case e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS:
2616         {
2617                 struct rte_sched_pipe *pipe = grinder->pipe;
2618
2619                 grinder->pipe_params = subport->pipe_profiles + pipe->profile;
2620                 grinder_prefetch_tc_queue_arrays(subport, pos);
2621                 grinder_credits_update(port, subport, pos);
2622
2623                 grinder->state = e_GRINDER_PREFETCH_MBUF;
2624                 return 0;
2625         }
2626
2627         case e_GRINDER_PREFETCH_MBUF:
2628         {
2629                 grinder_prefetch_mbuf(subport, pos);
2630
2631                 grinder->state = e_GRINDER_READ_MBUF;
2632                 return 0;
2633         }
2634
2635         case e_GRINDER_READ_MBUF:
2636         {
2637                 uint32_t wrr_active, result = 0;
2638
2639                 result = grinder_schedule(port, subport, pos);
2640
2641                 wrr_active = (grinder->tc_index == RTE_SCHED_TRAFFIC_CLASS_BE);
2642
2643                 /* Look for next packet within the same TC */
2644                 if (result && grinder->qmask) {
2645                         if (wrr_active)
2646                                 grinder_wrr(subport, pos);
2647
2648                         grinder_prefetch_mbuf(subport, pos);
2649
2650                         return 1;
2651                 }
2652
2653                 if (wrr_active)
2654                         grinder_wrr_store(subport, pos);
2655
2656                 /* Look for another active TC within same pipe */
2657                 if (grinder_next_tc(port, subport, pos)) {
2658                         grinder_prefetch_tc_queue_arrays(subport, pos);
2659
2660                         grinder->state = e_GRINDER_PREFETCH_MBUF;
2661                         return result;
2662                 }
2663
2664                 if (grinder->productive == 0 &&
2665                     subport->pipe_loop == RTE_SCHED_PIPE_INVALID)
2666                         subport->pipe_loop = grinder->pindex;
2667
2668                 grinder_evict(subport, pos);
2669
2670                 /* Look for another active pipe */
2671                 if (grinder_next_pipe(port, subport, pos)) {
2672                         grinder_prefetch_pipe(subport, pos);
2673
2674                         grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS;
2675                         return result;
2676                 }
2677
2678                 /* No active pipe found */
2679                 subport->busy_grinders--;
2680
2681                 grinder->state = e_GRINDER_PREFETCH_PIPE;
2682                 return result;
2683         }
2684
2685         default:
2686                 rte_panic("Algorithmic error (invalid state)\n");
2687                 return 0;
2688         }
2689 }
2690
2691 static inline void
2692 rte_sched_port_time_resync(struct rte_sched_port *port)
2693 {
2694         uint64_t cycles = rte_get_tsc_cycles();
2695         uint64_t cycles_diff = cycles - port->time_cpu_cycles;
2696         uint64_t bytes_diff;
2697
2698         /* Compute elapsed time in bytes */
2699         bytes_diff = rte_reciprocal_divide(cycles_diff << RTE_SCHED_TIME_SHIFT,
2700                                            port->inv_cycles_per_byte);
2701
2702         /* Advance port time */
2703         port->time_cpu_cycles = cycles;
2704         port->time_cpu_bytes += bytes_diff;
2705         if (port->time < port->time_cpu_bytes)
2706                 port->time = port->time_cpu_bytes;
2707
2708         /* Reset pipe loop detection */
2709         port->pipe_loop = RTE_SCHED_PIPE_INVALID;
2710 }
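
/*
 * Sketch of the cycles-to-bytes conversion above, assuming
 * inv_cycles_per_byte holds the reciprocal of the cycles-per-byte
 * ratio scaled by 2^RTE_SCHED_TIME_SHIFT (hypothetical rates): with a
 * 2 GHz TSC and a 10 Gbps (1.25e9 bytes/s) port, cycles_per_byte =
 * 1.6, scaled to ~409. A cycles_diff of 1000 then yields bytes_diff =
 * (1000 << 8) / 409 ~ 625 bytes, i.e. 1000 / 1.6.
 */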
2711
2712 static inline int
2713 rte_sched_port_exceptions(struct rte_sched_port *port, int second_pass)
2714 {
2715         int exceptions;
2716
2717         /* Check if any exception flag is set */
2718         exceptions = (second_pass && port->busy_grinders == 0) ||
2719                 (port->pipe_exhaustion == 1);
2720
2721         /* Clear exception flags */
2722         port->pipe_exhaustion = 0;
2723
2724         return exceptions;
2725 }
2726
2727 int
2728 rte_sched_port_dequeue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts)
2729 {
2730         uint32_t i, count;
2731
2732         port->pkts_out = pkts;
2733         port->n_pkts_out = 0;
2734
2735         rte_sched_port_time_resync(port);
2736
2737         /* Take each grinder one step further, round-robin */
2738         for (i = 0, count = 0; ; i++)  {
2739                 count += grinder_handle(port, i & (RTE_SCHED_PORT_N_GRINDERS - 1));
2740                 if ((count == n_pkts) ||
2741                     rte_sched_port_exceptions(port, i >= RTE_SCHED_PORT_N_GRINDERS)) {
2742                         break;
2743                 }
2744         }
2745
2746         return count;
2747 }
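
/*
 * Usage sketch (hypothetical run-to-completion loop; the port API is
 * not thread safe, so enqueue and dequeue are typically run on the
 * same lcore):
 *
 *   struct rte_mbuf *mbufs[64];
 *   uint16_t n_rx = rte_eth_rx_burst(port_id, 0, mbufs, 64);
 *
 *   ... classify each mbuf and call rte_sched_port_pkt_write() ...
 *
 *   rte_sched_port_enqueue(sched_port, mbufs, n_rx);
 *   int n = rte_sched_port_dequeue(sched_port, mbufs, 64);
 *   rte_eth_tx_burst(port_id, 0, mbufs, (uint16_t)n);
 */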