sched: update pkt dequeue for flexible config
lib/librte_sched/rte_sched.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <string.h>

#include <rte_common.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_branch_prediction.h>
#include <rte_mbuf.h>
#include <rte_bitmap.h>
#include <rte_reciprocal.h>

#include "rte_sched.h"
#include "rte_sched_common.h"
#include "rte_approx.h"

#ifdef __INTEL_COMPILER
#pragma warning(disable:2259) /* conversion may lose significant bits */
#endif

#ifdef RTE_SCHED_VECTOR
#include <rte_vect.h>

#ifdef RTE_ARCH_X86
#define SCHED_VECTOR_SSE4
#elif defined(RTE_MACHINE_CPUFLAG_NEON)
#define SCHED_VECTOR_NEON
#endif

#endif

#define RTE_SCHED_TB_RATE_CONFIG_ERR          (1e-7)
#define RTE_SCHED_WRR_SHIFT                   3
#define RTE_SCHED_MAX_QUEUES_PER_TC           RTE_SCHED_BE_QUEUES_PER_PIPE
#define RTE_SCHED_GRINDER_PCACHE_SIZE         (64 / RTE_SCHED_QUEUES_PER_PIPE)
#define RTE_SCHED_PIPE_INVALID                UINT32_MAX
#define RTE_SCHED_BMP_POS_INVALID             UINT32_MAX

/* Scaling for cycles_per_byte calculation
 * Chosen so that minimum rate is 480 bit/sec
 */
#define RTE_SCHED_TIME_SHIFT                  8
struct rte_sched_pipe_profile {
        /* Token bucket (TB) */
        uint32_t tb_period;
        uint32_t tb_credits_per_period;
        uint32_t tb_size;

        /* Pipe traffic classes */
        uint32_t tc_period;
        uint32_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint8_t tc_ov_weight;

        /* Pipe best-effort traffic class queues */
        uint8_t  wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
};

struct rte_sched_pipe {
        /* Token bucket (TB) */
        uint64_t tb_time; /* time of last update */
        uint32_t tb_credits;

        /* Pipe profile and flags */
        uint32_t profile;

        /* Traffic classes (TCs) */
        uint64_t tc_time; /* time of next update */
        uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];

        /* Weighted Round Robin (WRR) */
        uint8_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE];

        /* TC oversubscription */
        uint32_t tc_ov_credits;
        uint8_t tc_ov_period_id;
} __rte_cache_aligned;

struct rte_sched_queue {
        uint16_t qw;
        uint16_t qr;
};

struct rte_sched_queue_extra {
        struct rte_sched_queue_stats stats;
#ifdef RTE_SCHED_RED
        struct rte_red red;
#endif
};

enum grinder_state {
        e_GRINDER_PREFETCH_PIPE = 0,
        e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS,
        e_GRINDER_PREFETCH_MBUF,
        e_GRINDER_READ_MBUF
};

struct rte_sched_grinder {
        /* Pipe cache */
        uint16_t pcache_qmask[RTE_SCHED_GRINDER_PCACHE_SIZE];
        uint32_t pcache_qindex[RTE_SCHED_GRINDER_PCACHE_SIZE];
        uint32_t pcache_w;
        uint32_t pcache_r;

        /* Current pipe */
        enum grinder_state state;
        uint32_t productive;
        uint32_t pindex;
        struct rte_sched_subport *subport;
        struct rte_sched_pipe *pipe;
        struct rte_sched_pipe_profile *pipe_params;

        /* TC cache */
        uint8_t tccache_qmask[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tccache_qindex[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tccache_w;
        uint32_t tccache_r;

        /* Current TC */
        uint32_t tc_index;
        struct rte_sched_queue *queue[RTE_SCHED_MAX_QUEUES_PER_TC];
        struct rte_mbuf **qbase[RTE_SCHED_MAX_QUEUES_PER_TC];
        uint32_t qindex[RTE_SCHED_MAX_QUEUES_PER_TC];
        uint16_t qsize;
        uint32_t qmask;
        uint32_t qpos;
        struct rte_mbuf *pkt;

        /* WRR */
        uint16_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint16_t wrr_mask[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint8_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
};

struct rte_sched_subport {
        /* Token bucket (TB) */
        uint64_t tb_time; /* time of last update */
        uint32_t tb_period;
        uint32_t tb_credits_per_period;
        uint32_t tb_size;
        uint32_t tb_credits;

        /* Traffic classes (TCs) */
        uint64_t tc_time; /* time of next update */
        uint32_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tc_period;

        /* TC oversubscription */
        uint32_t tc_ov_wm;
        uint32_t tc_ov_wm_min;
        uint32_t tc_ov_wm_max;
        uint8_t tc_ov_period_id;
        uint8_t tc_ov;
        uint32_t tc_ov_n;
        double tc_ov_rate;

        /* Statistics */
        struct rte_sched_subport_stats stats;

        /* Subport pipes */
        uint32_t n_pipes_per_subport_enabled;
        uint32_t n_pipe_profiles;
        uint32_t n_max_pipe_profiles;

        /* Pipe best-effort TC rate */
        uint32_t pipe_tc_be_rate_max;

        /* Pipe queues size */
        uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];

#ifdef RTE_SCHED_RED
        struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS];
#endif

        /* Scheduling loop detection */
        uint32_t pipe_loop;
        uint32_t pipe_exhaustion;

        /* Bitmap */
        struct rte_bitmap *bmp;
        uint32_t grinder_base_bmp_pos[RTE_SCHED_PORT_N_GRINDERS] __rte_aligned_16;

        /* Grinders */
        struct rte_sched_grinder grinder[RTE_SCHED_PORT_N_GRINDERS];
        uint32_t busy_grinders;

        /* Queue base calculation */
        uint32_t qsize_add[RTE_SCHED_QUEUES_PER_PIPE];
        uint32_t qsize_sum;

        struct rte_sched_pipe *pipe;
        struct rte_sched_queue *queue;
        struct rte_sched_queue_extra *queue_extra;
        struct rte_sched_pipe_profile *pipe_profiles;
        uint8_t *bmp_array;
        struct rte_mbuf **queue_array;
        uint8_t memory[0] __rte_cache_aligned;
} __rte_cache_aligned;

struct rte_sched_port {
        /* User parameters */
        uint32_t n_subports_per_port;
        uint32_t n_pipes_per_subport;
        uint32_t n_pipes_per_subport_log2;
        uint16_t pipe_queue[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint8_t pipe_tc[RTE_SCHED_QUEUES_PER_PIPE];
        uint8_t tc_queue[RTE_SCHED_QUEUES_PER_PIPE];
        uint32_t rate;
        uint32_t mtu;
        uint32_t frame_overhead;
        int socket;
        uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t n_pipe_profiles;
        uint32_t n_max_pipe_profiles;
        uint32_t pipe_tc_be_rate_max;
#ifdef RTE_SCHED_RED
        struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS];
#endif

        /* Timing */
        uint64_t time_cpu_cycles;     /* Current CPU time measured in CPU cycles */
        uint64_t time_cpu_bytes;      /* Current CPU time measured in bytes */
        uint64_t time;                /* Current NIC TX time measured in bytes */
        struct rte_reciprocal inv_cycles_per_byte; /* CPU cycles per byte */

        /* Scheduling loop detection */
        uint32_t pipe_loop;
        uint32_t pipe_exhaustion;

        /* Bitmap */
        struct rte_bitmap *bmp;
        uint32_t grinder_base_bmp_pos[RTE_SCHED_PORT_N_GRINDERS] __rte_aligned_16;

        /* Grinders */
        struct rte_sched_grinder grinder[RTE_SCHED_PORT_N_GRINDERS];
        uint32_t busy_grinders;
        struct rte_mbuf **pkts_out;
        uint32_t n_pkts_out;
        uint32_t subport_id;

        /* Queue base calculation */
        uint32_t qsize_add[RTE_SCHED_QUEUES_PER_PIPE];
        uint32_t qsize_sum;

        /* Large data structures */
        struct rte_sched_subport *subports[0];
        struct rte_sched_subport *subport;
        struct rte_sched_pipe *pipe;
        struct rte_sched_queue *queue;
        struct rte_sched_queue_extra *queue_extra;
        struct rte_sched_pipe_profile *pipe_profiles;
        uint8_t *bmp_array;
        struct rte_mbuf **queue_array;
        uint8_t memory[0] __rte_cache_aligned;
} __rte_cache_aligned;

enum rte_sched_port_array {
        e_RTE_SCHED_PORT_ARRAY_SUBPORT = 0,
        e_RTE_SCHED_PORT_ARRAY_PIPE,
        e_RTE_SCHED_PORT_ARRAY_QUEUE,
        e_RTE_SCHED_PORT_ARRAY_QUEUE_EXTRA,
        e_RTE_SCHED_PORT_ARRAY_PIPE_PROFILES,
        e_RTE_SCHED_PORT_ARRAY_BMP_ARRAY,
        e_RTE_SCHED_PORT_ARRAY_QUEUE_ARRAY,
        e_RTE_SCHED_PORT_ARRAY_TOTAL,
};

enum rte_sched_subport_array {
        e_RTE_SCHED_SUBPORT_ARRAY_PIPE = 0,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA,
        e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES,
        e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY,
        e_RTE_SCHED_SUBPORT_ARRAY_TOTAL,
};

static inline uint32_t
rte_sched_subport_pipe_queues(struct rte_sched_subport *subport)
{
        return RTE_SCHED_QUEUES_PER_PIPE * subport->n_pipes_per_subport_enabled;
}

static inline struct rte_mbuf **
rte_sched_subport_pipe_qbase(struct rte_sched_subport *subport, uint32_t qindex)
{
        uint32_t pindex = qindex >> 4;
        uint32_t qpos = qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1);

        return (subport->queue_array + pindex *
                subport->qsize_sum + subport->qsize_add[qpos]);
}
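
/*
 * Example (illustrative): with RTE_SCHED_QUEUES_PER_PIPE = 16, qindex = 37
 * decomposes into pindex = 37 >> 4 = 2 (third pipe of the subport) and
 * qpos = 37 & 15 = 5. The returned base is then
 * queue_array + 2 * qsize_sum + qsize_add[5], i.e. the mbuf slots of the
 * sixth queue within the third pipe. The >> 4 shift relies on there being
 * exactly 16 queues per pipe.
 */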

static inline uint16_t
rte_sched_subport_pipe_qsize(struct rte_sched_port *port,
struct rte_sched_subport *subport, uint32_t qindex)
{
        uint32_t tc = port->pipe_tc[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return subport->qsize[tc];
}

static inline uint32_t
rte_sched_port_queues_per_port(struct rte_sched_port *port)
{
        return RTE_SCHED_QUEUES_PER_PIPE * port->n_pipes_per_subport * port->n_subports_per_port;
}

static inline uint16_t
rte_sched_port_pipe_queue(struct rte_sched_port *port, uint32_t traffic_class)
{
        uint16_t pipe_queue = port->pipe_queue[traffic_class];

        return pipe_queue;
}

static inline uint8_t
rte_sched_port_pipe_tc(struct rte_sched_port *port, uint32_t qindex)
{
        uint8_t pipe_tc = port->pipe_tc[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return pipe_tc;
}

static inline uint8_t
rte_sched_port_tc_queue(struct rte_sched_port *port, uint32_t qindex)
{
        uint8_t tc_queue = port->tc_queue[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return tc_queue;
}
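
/*
 * Illustrative view of the lookup tables used above, as built by
 * rte_sched_port_config() below (assuming 13 traffic classes, i.e. 12
 * strict priority + 1 best-effort, and 16 queues per pipe):
 *
 *    queue within pipe:  0  1 ... 11  12 13 14 15
 *    pipe_tc[]:          0  1 ... 11  12 12 12 12   (TC 12 = best-effort)
 *    tc_queue[]:         0  0 ...  0   0  1  2  3   (queue within the TC)
 *
 * The four highest queue slots all belong to the best-effort TC and are
 * distinguished only by their WRR queue index.
 */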

static int
pipe_profile_check(struct rte_sched_pipe_params *params,
        uint32_t rate, uint16_t *qsize)
{
        uint32_t i;

        /* Pipe parameters */
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        /* TB rate: non-zero, not greater than port rate */
        if (params->tb_rate == 0 ||
                params->tb_rate > rate) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb rate\n", __func__);
                return -EINVAL;
        }

        /* TB size: non-zero */
        if (params->tb_size == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb size\n", __func__);
                return -EINVAL;
        }

        /* TC rate: non-zero if qsize non-zero, less than pipe rate */
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if ((qsize[i] == 0 && params->tc_rate[i] != 0) ||
                        (qsize[i] != 0 && (params->tc_rate[i] == 0 ||
                        params->tc_rate[i] > params->tb_rate))) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for qsize or tc_rate\n", __func__);
                        return -EINVAL;
                }
        }

        if (params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE] == 0 ||
                qsize[RTE_SCHED_TRAFFIC_CLASS_BE] == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for be traffic class rate\n", __func__);
                return -EINVAL;
        }

        /* TC period: non-zero */
        if (params->tc_period == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc period\n", __func__);
                return -EINVAL;
        }

        /* Best effort tc oversubscription weight: non-zero */
        if (params->tc_ov_weight == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc ov weight\n", __func__);
                return -EINVAL;
        }

        /* Queue WRR weights: non-zero */
        for (i = 0; i < RTE_SCHED_BE_QUEUES_PER_PIPE; i++) {
                if (params->wrr_weights[i] == 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for wrr weight\n", __func__);
                        return -EINVAL;
                }
        }

        return 0;
}

static int
rte_sched_port_check_params(struct rte_sched_port_params *params)
{
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        /* socket */
        if (params->socket < 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for socket id\n", __func__);
                return -EINVAL;
        }

        /* rate */
        if (params->rate == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for rate\n", __func__);
                return -EINVAL;
        }

        /* mtu */
        if (params->mtu == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for mtu\n", __func__);
                return -EINVAL;
        }

        /* n_subports_per_port: non-zero, limited to 16 bits, power of 2 */
        if (params->n_subports_per_port == 0 ||
            params->n_subports_per_port > 1u << 16 ||
            !rte_is_power_of_2(params->n_subports_per_port)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for number of subports\n", __func__);
                return -EINVAL;
        }

        /* n_pipes_per_subport: non-zero, power of 2 */
        if (params->n_pipes_per_subport == 0 ||
            !rte_is_power_of_2(params->n_pipes_per_subport)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for maximum pipes number\n", __func__);
                return -EINVAL;
        }

        return 0;
}

static uint32_t
rte_sched_subport_get_array_base(struct rte_sched_subport_params *params,
        enum rte_sched_subport_array array)
{
        uint32_t n_pipes_per_subport = params->n_pipes_per_subport_enabled;
        uint32_t n_subport_pipe_queues =
                RTE_SCHED_QUEUES_PER_PIPE * n_pipes_per_subport;

        uint32_t size_pipe = n_pipes_per_subport * sizeof(struct rte_sched_pipe);
        uint32_t size_queue =
                n_subport_pipe_queues * sizeof(struct rte_sched_queue);
        uint32_t size_queue_extra
                = n_subport_pipe_queues * sizeof(struct rte_sched_queue_extra);
        uint32_t size_pipe_profiles = params->n_max_pipe_profiles *
                sizeof(struct rte_sched_pipe_profile);
        uint32_t size_bmp_array =
                rte_bitmap_get_memory_footprint(n_subport_pipe_queues);
        uint32_t size_per_pipe_queue_array, size_queue_array;

        uint32_t base, i;

        size_per_pipe_queue_array = 0;
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if (i < RTE_SCHED_TRAFFIC_CLASS_BE)
                        size_per_pipe_queue_array +=
                                params->qsize[i] * sizeof(struct rte_mbuf *);
                else
                        size_per_pipe_queue_array += RTE_SCHED_MAX_QUEUES_PER_TC *
                                params->qsize[i] * sizeof(struct rte_mbuf *);
        }
        size_queue_array = n_pipes_per_subport * size_per_pipe_queue_array;

        base = 0;

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_PIPE)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_pipe);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue_extra);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_pipe_profiles);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_bmp_array);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue_array);

        return base;
}
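
/*
 * Resulting memory layout of one subport (illustrative), with each section
 * rounded up to a cache line boundary:
 *
 *    [pipes][queues][queue extras][pipe profiles][bitmap array][queue array]
 *
 * Passing e_RTE_SCHED_SUBPORT_ARRAY_TOTAL falls through every section and
 * returns the total footprint, which is how
 * rte_sched_port_get_memory_footprint() below sizes each subport.
 */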

static void
rte_sched_subport_config_qsize(struct rte_sched_subport *subport)
{
        uint32_t i;

        subport->qsize_add[0] = 0;

        /* Strict priority traffic class */
        for (i = 1; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                subport->qsize_add[i] = subport->qsize_add[i-1] + subport->qsize[i-1];

        /* Best-effort traffic class */
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 1] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 2] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 1] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 3] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 2] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];

        subport->qsize_sum = subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 3] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
}
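
/*
 * Worked example (illustrative): with qsize[0..11] = 64 for the strict
 * priority TCs and qsize[12] = 256 for best-effort, the loop yields
 * qsize_add = {0, 64, 128, ..., 704, 768}, the three extra best-effort
 * slots extend it to {..., 1024, 1280, 1536}, and qsize_sum = 1792 mbuf
 * pointers per pipe.
 */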

static void
rte_sched_port_log_pipe_profile(struct rte_sched_subport *subport, uint32_t i)
{
        struct rte_sched_pipe_profile *p = subport->pipe_profiles + i;

        RTE_LOG(DEBUG, SCHED, "Low level config for pipe profile %u:\n"
                "       Token bucket: period = %u, credits per period = %u, size = %u\n"
                "       Traffic classes: period = %u,\n"
                "       credits per period = [%u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u]\n"
                "       Best-effort traffic class oversubscription: weight = %hhu\n"
                "       WRR cost: [%hhu, %hhu, %hhu, %hhu]\n",
                i,

                /* Token bucket */
                p->tb_period,
                p->tb_credits_per_period,
                p->tb_size,

                /* Traffic classes */
                p->tc_period,
                p->tc_credits_per_period[0],
                p->tc_credits_per_period[1],
                p->tc_credits_per_period[2],
                p->tc_credits_per_period[3],
                p->tc_credits_per_period[4],
                p->tc_credits_per_period[5],
                p->tc_credits_per_period[6],
                p->tc_credits_per_period[7],
                p->tc_credits_per_period[8],
                p->tc_credits_per_period[9],
                p->tc_credits_per_period[10],
                p->tc_credits_per_period[11],
                p->tc_credits_per_period[12],

                /* Best-effort traffic class oversubscription */
                p->tc_ov_weight,

                /* WRR */
                p->wrr_cost[0], p->wrr_cost[1], p->wrr_cost[2], p->wrr_cost[3]);
}

static inline uint64_t
rte_sched_time_ms_to_bytes(uint32_t time_ms, uint32_t rate)
{
        uint64_t time = time_ms;

        time = (time * rate) / 1000;

        return time;
}
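
/*
 * Example (illustrative): tc_period = 10 ms at rate = 1.25e9 bytes/sec
 * converts to (10 * 1250000000) / 1000 = 12500000 bytes, i.e. the byte
 * budget of one TC refill window at 10 GbE line rate.
 */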

static void
rte_sched_pipe_profile_convert(struct rte_sched_subport *subport,
        struct rte_sched_pipe_params *src,
        struct rte_sched_pipe_profile *dst,
        uint32_t rate)
{
        uint32_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint32_t lcd1, lcd2, lcd;
        uint32_t i;

        /* Token Bucket */
        if (src->tb_rate == rate) {
                dst->tb_credits_per_period = 1;
                dst->tb_period = 1;
        } else {
                double tb_rate = (double) src->tb_rate
                                / (double) rate;
                double d = RTE_SCHED_TB_RATE_CONFIG_ERR;

                rte_approx(tb_rate, d,
                        &dst->tb_credits_per_period, &dst->tb_period);
        }

        dst->tb_size = src->tb_size;

        /* Traffic Classes */
        dst->tc_period = rte_sched_time_ms_to_bytes(src->tc_period,
                                                rate);

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (subport->qsize[i])
                        dst->tc_credits_per_period[i]
                                = rte_sched_time_ms_to_bytes(src->tc_period,
                                        src->tc_rate[i]);

        dst->tc_ov_weight = src->tc_ov_weight;

        /* WRR queues */
        wrr_cost[0] = src->wrr_weights[0];
        wrr_cost[1] = src->wrr_weights[1];
        wrr_cost[2] = src->wrr_weights[2];
        wrr_cost[3] = src->wrr_weights[3];

        lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]);
        lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]);
        lcd = rte_get_lcd(lcd1, lcd2);

        wrr_cost[0] = lcd / wrr_cost[0];
        wrr_cost[1] = lcd / wrr_cost[1];
        wrr_cost[2] = lcd / wrr_cost[2];
        wrr_cost[3] = lcd / wrr_cost[3];

        dst->wrr_cost[0] = (uint8_t) wrr_cost[0];
        dst->wrr_cost[1] = (uint8_t) wrr_cost[1];
        dst->wrr_cost[2] = (uint8_t) wrr_cost[2];
        dst->wrr_cost[3] = (uint8_t) wrr_cost[3];
}
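
/*
 * WRR cost example (illustrative, assuming rte_get_lcd() returns the
 * lowest common multiple of its arguments): weights {1, 2, 4, 8} give
 * lcd1 = lcm(1, 2) = 2, lcd2 = lcm(4, 8) = 8, lcd = lcm(2, 8) = 8, so
 * wrr_cost = {8, 4, 2, 1}. The cost is the inverse of the weight: the
 * cheapest queue (cost 1) is served 8x more often than the most
 * expensive one (cost 8).
 */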

static void
rte_sched_subport_config_pipe_profile_table(struct rte_sched_subport *subport,
        struct rte_sched_subport_params *params, uint32_t rate)
{
        uint32_t i;

        for (i = 0; i < subport->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *src = params->pipe_profiles + i;
                struct rte_sched_pipe_profile *dst = subport->pipe_profiles + i;

                rte_sched_pipe_profile_convert(subport, src, dst, rate);
                rte_sched_port_log_pipe_profile(subport, i);
        }

        subport->pipe_tc_be_rate_max = 0;
        for (i = 0; i < subport->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *src = params->pipe_profiles + i;
                uint32_t pipe_tc_be_rate = src->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE];

                if (subport->pipe_tc_be_rate_max < pipe_tc_be_rate)
                        subport->pipe_tc_be_rate_max = pipe_tc_be_rate;
        }
}

static int
rte_sched_subport_check_params(struct rte_sched_subport_params *params,
        uint32_t n_max_pipes_per_subport,
        uint32_t rate)
{
        uint32_t i;

        /* Check user parameters */
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        if (params->tb_rate == 0 || params->tb_rate > rate) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb rate\n", __func__);
                return -EINVAL;
        }

        if (params->tb_size == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb size\n", __func__);
                return -EINVAL;
        }

        /* qsize: if non-zero, power of 2,
         * no bigger than 32K (due to 16-bit read/write pointers)
         */
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint16_t qsize = params->qsize[i];

                if (qsize != 0 && !rte_is_power_of_2(qsize)) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for qsize\n", __func__);
                        return -EINVAL;
                }
        }

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint32_t tc_rate = params->tc_rate[i];
                uint16_t qsize = params->qsize[i];

                if ((qsize == 0 && tc_rate != 0) ||
                        (qsize != 0 && tc_rate == 0) ||
                        (tc_rate > params->tb_rate)) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for tc rate\n", __func__);
                        return -EINVAL;
                }
        }

        if (params->qsize[RTE_SCHED_TRAFFIC_CLASS_BE] == 0 ||
                params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE] == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect qsize or tc rate(best effort)\n", __func__);
                return -EINVAL;
        }

        if (params->tc_period == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc period\n", __func__);
                return -EINVAL;
        }

        /* n_pipes_per_subport_enabled: non-zero, within max limit, power of 2 */
        if (params->n_pipes_per_subport_enabled == 0 ||
                params->n_pipes_per_subport_enabled > n_max_pipes_per_subport ||
            !rte_is_power_of_2(params->n_pipes_per_subport_enabled)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for pipes number\n", __func__);
                return -EINVAL;
        }

        /* pipe_profiles and n_pipe_profiles */
        if (params->pipe_profiles == NULL ||
            params->n_pipe_profiles == 0 ||
                params->n_max_pipe_profiles == 0 ||
                params->n_pipe_profiles > params->n_max_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for pipe profiles\n", __func__);
                return -EINVAL;
        }

        for (i = 0; i < params->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *p = params->pipe_profiles + i;
                int status;

                status = pipe_profile_check(p, rate, &params->qsize[0]);
                if (status != 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Pipe profile check failed(%d)\n", __func__, status);
                        return -EINVAL;
                }
        }

        return 0;
}

uint32_t
rte_sched_port_get_memory_footprint(struct rte_sched_port_params *port_params,
        struct rte_sched_subport_params **subport_params)
{
        uint32_t size0 = 0, size1 = 0, i;
        int status;

        status = rte_sched_port_check_params(port_params);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Port scheduler port params check failed (%d)\n",
                        __func__, status);

                return 0;
        }

        for (i = 0; i < port_params->n_subports_per_port; i++) {
                struct rte_sched_subport_params *sp = subport_params[i];

                status = rte_sched_subport_check_params(sp,
                                port_params->n_pipes_per_subport,
                                port_params->rate);
                if (status != 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Port scheduler subport params check failed (%d)\n",
                                __func__, status);

                        return 0;
                }
        }

        size0 = sizeof(struct rte_sched_port);

        for (i = 0; i < port_params->n_subports_per_port; i++) {
                struct rte_sched_subport_params *sp = subport_params[i];

                size1 += rte_sched_subport_get_array_base(sp,
                                        e_RTE_SCHED_SUBPORT_ARRAY_TOTAL);
        }

        return size0 + size1;
}
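
/*
 * Typical usage (illustrative sketch, error handling abbreviated): an
 * application can call this before configuration to report or validate
 * the memory needs of a port, e.g.
 *
 *    uint32_t sz = rte_sched_port_get_memory_footprint(&port_params,
 *                                                      subport_params);
 *    if (sz == 0)
 *            rte_exit(EXIT_FAILURE, "invalid scheduler params\n");
 *
 * A return value of 0 means the port or subport parameter check failed.
 */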

struct rte_sched_port *
rte_sched_port_config(struct rte_sched_port_params *params)
{
        struct rte_sched_port *port = NULL;
        uint32_t size0, size1;
        uint32_t cycles_per_byte;
        uint32_t i, j;
        int status;

        status = rte_sched_port_check_params(params);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Port scheduler params check failed (%d)\n",
                        __func__, status);
                return NULL;
        }

        size0 = sizeof(struct rte_sched_port);
        size1 = params->n_subports_per_port * sizeof(struct rte_sched_subport *);

        /* Allocate memory to store the data structures */
        port = rte_zmalloc_socket("qos_params", size0 + size1, RTE_CACHE_LINE_SIZE,
                params->socket);
        if (port == NULL) {
                RTE_LOG(ERR, SCHED, "%s: Memory allocation fails\n", __func__);

                return NULL;
        }

        /* User parameters */
        port->n_subports_per_port = params->n_subports_per_port;
        port->n_pipes_per_subport = params->n_pipes_per_subport;
        port->n_pipes_per_subport_log2 =
                        __builtin_ctz(params->n_pipes_per_subport);
        port->socket = params->socket;

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                port->pipe_queue[i] = i;

        for (i = 0, j = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) {
                port->pipe_tc[i] = j;

                if (j < RTE_SCHED_TRAFFIC_CLASS_BE)
                        j++;
        }

        for (i = 0, j = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) {
                port->tc_queue[i] = j;

                if (i >= RTE_SCHED_TRAFFIC_CLASS_BE)
                        j++;
        }
        port->rate = params->rate;
        port->mtu = params->mtu + params->frame_overhead;
        port->frame_overhead = params->frame_overhead;

        /* Timing */
        port->time_cpu_cycles = rte_get_tsc_cycles();
        port->time_cpu_bytes = 0;
        port->time = 0;

        cycles_per_byte = (rte_get_tsc_hz() << RTE_SCHED_TIME_SHIFT)
                / params->rate;
        port->inv_cycles_per_byte = rte_reciprocal_value(cycles_per_byte);

        /* Grinders */
        port->pkts_out = NULL;
        port->n_pkts_out = 0;
        port->subport_id = 0;

        return port;
}

static inline void
rte_sched_subport_free(struct rte_sched_port *port,
        struct rte_sched_subport *subport)
{
        uint32_t n_subport_pipe_queues;
        uint32_t qindex;

        if (subport == NULL)
                return;

        n_subport_pipe_queues = rte_sched_subport_pipe_queues(subport);

        /* Free enqueued mbufs */
        for (qindex = 0; qindex < n_subport_pipe_queues; qindex++) {
                struct rte_mbuf **mbufs =
                        rte_sched_subport_pipe_qbase(subport, qindex);
                uint16_t qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
                if (qsize != 0) {
                        struct rte_sched_queue *queue = subport->queue + qindex;
                        uint16_t qr = queue->qr & (qsize - 1);
                        uint16_t qw = queue->qw & (qsize - 1);

                        for (; qr != qw; qr = (qr + 1) & (qsize - 1))
                                rte_pktmbuf_free(mbufs[qr]);
                }
        }

        rte_bitmap_free(subport->bmp);
}

void
rte_sched_port_free(struct rte_sched_port *port)
{
        uint32_t i;

        /* Check user parameters */
        if (port == NULL)
                return;

        for (i = 0; i < port->n_subports_per_port; i++)
                rte_sched_subport_free(port, port->subports[i]);

        rte_free(port);
}

static void
rte_sched_port_log_subport_config(struct rte_sched_port *port, uint32_t i)
{
        struct rte_sched_subport *s = port->subports[i];

        RTE_LOG(DEBUG, SCHED, "Low level config for subport %u:\n"
                "       Token bucket: period = %u, credits per period = %u, size = %u\n"
                "       Traffic classes: period = %u\n"
                "       credits per period = [%u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u]\n"
                "       Best effort traffic class oversubscription: wm min = %u, wm max = %u\n",
                i,

                /* Token bucket */
                s->tb_period,
                s->tb_credits_per_period,
                s->tb_size,

                /* Traffic classes */
                s->tc_period,
                s->tc_credits_per_period[0],
                s->tc_credits_per_period[1],
                s->tc_credits_per_period[2],
                s->tc_credits_per_period[3],
                s->tc_credits_per_period[4],
                s->tc_credits_per_period[5],
                s->tc_credits_per_period[6],
                s->tc_credits_per_period[7],
                s->tc_credits_per_period[8],
                s->tc_credits_per_period[9],
                s->tc_credits_per_period[10],
                s->tc_credits_per_period[11],
                s->tc_credits_per_period[12],

                /* Best effort traffic class oversubscription */
                s->tc_ov_wm_min,
                s->tc_ov_wm_max);
}

static void
rte_sched_free_memory(struct rte_sched_port *port, uint32_t n_subports)
{
        uint32_t i;

        for (i = 0; i < n_subports; i++) {
                struct rte_sched_subport *subport = port->subports[i];

                rte_sched_subport_free(port, subport);
        }

        rte_free(port);
}

int
rte_sched_subport_config(struct rte_sched_port *port,
        uint32_t subport_id,
        struct rte_sched_subport_params *params)
{
        struct rte_sched_subport *s = NULL;
        uint32_t n_subports = subport_id;
        uint32_t n_subport_pipe_queues, i;
        uint32_t size0, size1, bmp_mem_size;
        int status;

        /* Check user parameters */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        status = rte_sched_subport_check_params(params,
                port->n_pipes_per_subport,
                port->rate);
        if (status != 0) {
                RTE_LOG(NOTICE, SCHED,
                        "%s: Port scheduler params check failed (%d)\n",
                        __func__, status);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        /* Determine the amount of memory to allocate */
        size0 = sizeof(struct rte_sched_subport);
        size1 = rte_sched_subport_get_array_base(params,
                                e_RTE_SCHED_SUBPORT_ARRAY_TOTAL);

        /* Allocate memory to store the data structures */
        s = rte_zmalloc_socket("subport_params", size0 + size1,
                RTE_CACHE_LINE_SIZE, port->socket);
        if (s == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Memory allocation fails\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -ENOMEM;
        }

        n_subports++;

        /* Port */
        port->subports[subport_id] = s;

        /* Token Bucket (TB) */
        if (params->tb_rate == port->rate) {
                s->tb_credits_per_period = 1;
                s->tb_period = 1;
        } else {
                double tb_rate = ((double) params->tb_rate) / ((double) port->rate);
                double d = RTE_SCHED_TB_RATE_CONFIG_ERR;

                rte_approx(tb_rate, d, &s->tb_credits_per_period, &s->tb_period);
        }

        s->tb_size = params->tb_size;
        s->tb_time = port->time;
        s->tb_credits = s->tb_size / 2;

        /* Traffic Classes (TCs) */
        s->tc_period = rte_sched_time_ms_to_bytes(params->tc_period, port->rate);
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if (params->qsize[i])
                        s->tc_credits_per_period[i]
                                = rte_sched_time_ms_to_bytes(params->tc_period,
                                        params->tc_rate[i]);
        }
        s->tc_time = port->time + s->tc_period;
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (params->qsize[i])
                        s->tc_credits[i] = s->tc_credits_per_period[i];

        /* compile time checks */
        RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS == 0);
        RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS &
                (RTE_SCHED_PORT_N_GRINDERS - 1));

        /* User parameters */
        s->n_pipes_per_subport_enabled = params->n_pipes_per_subport_enabled;
        memcpy(s->qsize, params->qsize, sizeof(params->qsize));
        s->n_pipe_profiles = params->n_pipe_profiles;
        s->n_max_pipe_profiles = params->n_max_pipe_profiles;

#ifdef RTE_SCHED_RED
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint32_t j;

                for (j = 0; j < RTE_COLORS; j++) {
                        /* if min/max are both zero, then RED is disabled */
                        if ((params->red_params[i][j].min_th |
                             params->red_params[i][j].max_th) == 0) {
                                continue;
                        }

                        if (rte_red_config_init(&s->red_config[i][j],
                                params->red_params[i][j].wq_log2,
                                params->red_params[i][j].min_th,
                                params->red_params[i][j].max_th,
                                params->red_params[i][j].maxp_inv) != 0) {
                                rte_sched_free_memory(port, n_subports);

                                RTE_LOG(NOTICE, SCHED,
                                "%s: RED configuration init fails\n", __func__);
                                return -EINVAL;
                        }
                }
        }
#endif

        /* Scheduling loop detection */
        s->pipe_loop = RTE_SCHED_PIPE_INVALID;
        s->pipe_exhaustion = 0;

        /* Grinders */
        s->busy_grinders = 0;

        /* Queue base calculation */
        rte_sched_subport_config_qsize(s);

        /* Large data structures */
        s->pipe = (struct rte_sched_pipe *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_PIPE));
        s->queue = (struct rte_sched_queue *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE));
        s->queue_extra = (struct rte_sched_queue_extra *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA));
        s->pipe_profiles = (struct rte_sched_pipe_profile *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES));
        s->bmp_array =  s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY);
        s->queue_array = (struct rte_mbuf **)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY));

        /* Pipe profile table */
        rte_sched_subport_config_pipe_profile_table(s, params, port->rate);

        /* Bitmap */
        n_subport_pipe_queues = rte_sched_subport_pipe_queues(s);
        bmp_mem_size = rte_bitmap_get_memory_footprint(n_subport_pipe_queues);
        s->bmp = rte_bitmap_init(n_subport_pipe_queues, s->bmp_array,
                                bmp_mem_size);
        if (s->bmp == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Subport bitmap init error\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++)
                s->grinder_base_bmp_pos[i] = RTE_SCHED_PIPE_INVALID;

#ifdef RTE_SCHED_SUBPORT_TC_OV
        /* TC oversubscription */
        s->tc_ov_wm_min = port->mtu;
        s->tc_ov_wm_max = rte_sched_time_ms_to_bytes(params->tc_period,
                                                     s->pipe_tc_be_rate_max);
        s->tc_ov_wm = s->tc_ov_wm_max;
        s->tc_ov_period_id = 0;
        s->tc_ov = 0;
        s->tc_ov_n = 0;
        s->tc_ov_rate = 0;
#endif

        rte_sched_port_log_subport_config(port, subport_id);

        return 0;
}

int
rte_sched_pipe_config(struct rte_sched_port *port,
        uint32_t subport_id,
        uint32_t pipe_id,
        int32_t pipe_profile)
{
        struct rte_sched_subport *s;
        struct rte_sched_pipe *p;
        struct rte_sched_pipe_profile *params;
        uint32_t n_subports = subport_id + 1;
        uint32_t deactivate, profile, i;

        /* Check user parameters */
        profile = (uint32_t) pipe_profile;
        deactivate = (pipe_profile < 0);

        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter subport id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        s = port->subports[subport_id];
        if (pipe_id >= s->n_pipes_per_subport_enabled) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter pipe id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        if (!deactivate && profile >= s->n_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter pipe profile\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        /* Handle the case when pipe already has a valid configuration */
        p = s->pipe + pipe_id;
        if (p->tb_time) {
                params = s->pipe_profiles + p->profile;

                double subport_tc_be_rate =
                        (double) s->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) s->tc_period;
                double pipe_tc_be_rate =
                        (double) params->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) params->tc_period;
                uint32_t tc_be_ov = s->tc_ov;

                /* Unplug pipe from its subport */
                s->tc_ov_n -= params->tc_ov_weight;
                s->tc_ov_rate -= pipe_tc_be_rate;
                s->tc_ov = s->tc_ov_rate > subport_tc_be_rate;

                if (s->tc_ov != tc_be_ov) {
                        RTE_LOG(DEBUG, SCHED,
                                "Subport %u Best-effort TC oversubscription is OFF (%.4lf >= %.4lf)\n",
                                subport_id, subport_tc_be_rate, s->tc_ov_rate);
                }

                /* Reset the pipe */
                memset(p, 0, sizeof(struct rte_sched_pipe));
        }

        if (deactivate)
                return 0;

        /* Apply the new pipe configuration */
        p->profile = profile;
        params = s->pipe_profiles + p->profile;

        /* Token Bucket (TB) */
        p->tb_time = port->time;
        p->tb_credits = params->tb_size / 2;

        /* Traffic Classes (TCs) */
        p->tc_time = port->time + params->tc_period;

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (s->qsize[i])
                        p->tc_credits[i] = params->tc_credits_per_period[i];

        {
                /* Subport best effort tc oversubscription */
                double subport_tc_be_rate =
                        (double) s->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) s->tc_period;
                double pipe_tc_be_rate =
                        (double) params->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) params->tc_period;
                uint32_t tc_be_ov = s->tc_ov;

                s->tc_ov_n += params->tc_ov_weight;
                s->tc_ov_rate += pipe_tc_be_rate;
                s->tc_ov = s->tc_ov_rate > subport_tc_be_rate;

                if (s->tc_ov != tc_be_ov) {
                        RTE_LOG(DEBUG, SCHED,
                                "Subport %u Best effort TC oversubscription is ON (%.4lf < %.4lf)\n",
                                subport_id, subport_tc_be_rate, s->tc_ov_rate);
                }
                p->tc_ov_period_id = s->tc_ov_period_id;
                p->tc_ov_credits = s->tc_ov_wm;
        }

        return 0;
}
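
/*
 * Configuration call sequence (illustrative sketch, error handling omitted;
 * the parameter variable names are hypothetical):
 *
 *    struct rte_sched_port *port = rte_sched_port_config(&port_params);
 *
 *    for (i = 0; i < port_params.n_subports_per_port; i++) {
 *            rte_sched_subport_config(port, i, subport_params[i]);
 *            for (j = 0; j < subport_params[i]->n_pipes_per_subport_enabled; j++)
 *                    rte_sched_pipe_config(port, i, j, 0);
 *    }
 *
 * Passing a negative profile to rte_sched_pipe_config() deactivates the
 * pipe: it is unplugged from the subport oversubscription accounting and
 * its state is reset.
 */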

int
rte_sched_subport_pipe_profile_add(struct rte_sched_port *port,
        uint32_t subport_id,
        struct rte_sched_pipe_params *params,
        uint32_t *pipe_profile_id)
{
        struct rte_sched_subport *s;
        struct rte_sched_pipe_profile *pp;
        uint32_t i;
        int status;

        /* Port */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        /* Subport id must not exceed the max limit */
        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);
                return -EINVAL;
        }

        s = port->subports[subport_id];

        /* Number of pipe profiles must not exceed the max limit */
        if (s->n_pipe_profiles >= s->n_max_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Number of pipe profiles exceeds the max limit\n", __func__);
                return -EINVAL;
        }

        /* Pipe params */
        status = pipe_profile_check(params, port->rate, &s->qsize[0]);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Pipe profile check failed(%d)\n", __func__, status);
                return -EINVAL;
        }

        pp = &s->pipe_profiles[s->n_pipe_profiles];
        rte_sched_pipe_profile_convert(s, params, pp, port->rate);

        /* Pipe profile must not already exist */
        for (i = 0; i < s->n_pipe_profiles; i++)
                if (memcmp(s->pipe_profiles + i, pp, sizeof(*pp)) == 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Pipe profile exists\n", __func__);
                        return -EINVAL;
                }

        /* Pipe profile commit */
        *pipe_profile_id = s->n_pipe_profiles;
        s->n_pipe_profiles++;

        if (s->pipe_tc_be_rate_max < params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE])
                s->pipe_tc_be_rate_max = params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE];

        rte_sched_port_log_pipe_profile(s, *pipe_profile_id);

        return 0;
}

static inline uint32_t
rte_sched_port_qindex(struct rte_sched_port *port,
        uint32_t subport,
        uint32_t pipe,
        uint32_t traffic_class,
        uint32_t queue)
{
        return ((subport & (port->n_subports_per_port - 1)) <<
                (port->n_pipes_per_subport_log2 + 4)) |
                ((pipe &
                (port->subports[subport]->n_pipes_per_subport_enabled - 1)) << 4) |
                ((rte_sched_port_pipe_queue(port, traffic_class) + queue) &
                (RTE_SCHED_QUEUES_PER_PIPE - 1));
}
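
/*
 * Queue index bit layout (illustrative, assuming n_pipes_per_subport_log2
 * = 10 and 16 queues per pipe):
 *
 *    qindex = (subport << 14) | (pipe << 4) | pipe_queue
 *
 * e.g. subport 3, pipe 5, best-effort TC (pipe queue base 12), queue 2:
 * qindex = (3 << 14) | (5 << 4) | 14 = 49246.
 */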

void
rte_sched_port_pkt_write(struct rte_sched_port *port,
                         struct rte_mbuf *pkt,
                         uint32_t subport, uint32_t pipe,
                         uint32_t traffic_class,
                         uint32_t queue, enum rte_color color)
{
        uint32_t queue_id =
                rte_sched_port_qindex(port, subport, pipe, traffic_class, queue);

        rte_mbuf_sched_set(pkt, queue_id, traffic_class, (uint8_t)color);
}

void
rte_sched_port_pkt_read_tree_path(struct rte_sched_port *port,
                                  const struct rte_mbuf *pkt,
                                  uint32_t *subport, uint32_t *pipe,
                                  uint32_t *traffic_class, uint32_t *queue)
{
        uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);

        *subport = queue_id >> (port->n_pipes_per_subport_log2 + 4);
        *pipe = (queue_id >> 4) &
                (port->subports[*subport]->n_pipes_per_subport_enabled - 1);
        *traffic_class = rte_sched_port_pipe_tc(port, queue_id);
        *queue = rte_sched_port_tc_queue(port, queue_id);
}

enum rte_color
rte_sched_port_pkt_read_color(const struct rte_mbuf *pkt)
{
        return (enum rte_color)rte_mbuf_sched_color_get(pkt);
}
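
/*
 * Round-trip example (illustrative): the classifier stage stamps each mbuf
 * before enqueue, and the same path can be recovered at any later stage:
 *
 *    rte_sched_port_pkt_write(port, pkt, subport, pipe, tc, queue,
 *                             RTE_COLOR_GREEN);
 *    ...
 *    rte_sched_port_pkt_read_tree_path(port, pkt, &subport, &pipe,
 *                                      &tc, &queue);
 *
 * The tuple is packed into the mbuf sched field, so no lookup table is
 * needed on the read side.
 */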
1415
1416 int
1417 rte_sched_subport_read_stats(struct rte_sched_port *port,
1418                              uint32_t subport_id,
1419                              struct rte_sched_subport_stats *stats,
1420                              uint32_t *tc_ov)
1421 {
1422         struct rte_sched_subport *s;
1423
1424         /* Check user parameters */
1425         if (port == NULL) {
1426                 RTE_LOG(ERR, SCHED,
1427                         "%s: Incorrect value for parameter port\n", __func__);
1428                 return -EINVAL;
1429         }
1430
1431         if (subport_id >= port->n_subports_per_port) {
1432                 RTE_LOG(ERR, SCHED,
1433                         "%s: Incorrect value for subport id\n", __func__);
1434                 return -EINVAL;
1435         }
1436
1437         if (stats == NULL) {
1438                 RTE_LOG(ERR, SCHED,
1439                         "%s: Incorrect value for parameter stats\n", __func__);
1440                 return -EINVAL;
1441         }
1442
1443         if (tc_ov == NULL) {
1444                 RTE_LOG(ERR, SCHED,
1445                         "%s: Incorrect value for tc_ov\n", __func__);
1446                 return -EINVAL;
1447         }
1448
1449         s = port->subports[subport_id];
1450
1451         /* Copy subport stats and clear */
1452         memcpy(stats, &s->stats, sizeof(struct rte_sched_subport_stats));
1453         memset(&s->stats, 0, sizeof(struct rte_sched_subport_stats));
1454
1455         /* Subport TC oversubscription status */
1456         *tc_ov = s->tc_ov;
1457
1458         return 0;
1459 }
1460
1461 int
1462 rte_sched_queue_read_stats(struct rte_sched_port *port,
1463         uint32_t queue_id,
1464         struct rte_sched_queue_stats *stats,
1465         uint16_t *qlen)
1466 {
1467         struct rte_sched_queue *q;
1468         struct rte_sched_queue_extra *qe;
1469
1470         /* Check user parameters */
1471         if (port == NULL) {
1472                 RTE_LOG(ERR, SCHED,
1473                         "%s: Incorrect value for parameter port\n", __func__);
1474                 return -EINVAL;
1475         }
1476
1477         if (queue_id >= rte_sched_port_queues_per_port(port)) {
1478                 RTE_LOG(ERR, SCHED,
1479                         "%s: Incorrect value for queue id\n", __func__);
1480                 return -EINVAL;
1481         }
1482
1483         if (stats == NULL) {
1484                 RTE_LOG(ERR, SCHED,
1485                         "%s: Incorrect value for parameter stats\n", __func__);
1486                 return -EINVAL;
1487         }
1488
1489         if (qlen == NULL) {
1490                 RTE_LOG(ERR, SCHED,
1491                         "%s: Incorrect value for parameter qlen\n", __func__);
1492                 return -EINVAL;
1493         }
1494         q = port->queue + queue_id;
1495         qe = port->queue_extra + queue_id;
1496
1497         /* Copy queue stats and clear */
1498         memcpy(stats, &qe->stats, sizeof(struct rte_sched_queue_stats));
1499         memset(&qe->stats, 0, sizeof(struct rte_sched_queue_stats));
1500
1501         /* Queue length */
1502         *qlen = q->qw - q->qr;
1503
1504         return 0;
1505 }
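
/*
 * Illustrative sketch, not part of the original file: periodic stats
 * polling. Both read calls clear the counters on success, so each call
 * returns the deltas accumulated since the previous read. The function
 * name is hypothetical and error handling is reduced to skipping the
 * printout.
 */
static __rte_unused void
example_poll_stats(struct rte_sched_port *port, uint32_t subport_id,
	uint32_t queue_id)
{
	struct rte_sched_subport_stats sstats;
	struct rte_sched_queue_stats qstats;
	uint32_t tc_ov;
	uint16_t qlen;

	if (rte_sched_subport_read_stats(port, subport_id, &sstats, &tc_ov) == 0)
		printf("subport %u: TC0 pkts %" PRIu64 ", oversubscription %u\n",
			subport_id, (uint64_t)sstats.n_pkts_tc[0], tc_ov);

	if (rte_sched_queue_read_stats(port, queue_id, &qstats, &qlen) == 0)
		printf("queue %u: pkts %" PRIu64 ", backlog %u\n",
			queue_id, (uint64_t)qstats.n_pkts, qlen);
}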
1506
1507 #ifdef RTE_SCHED_DEBUG
1508
1509 static inline int
1510 rte_sched_port_queue_is_empty(struct rte_sched_subport *subport,
1511         uint32_t qindex)
1512 {
1513         struct rte_sched_queue *queue = subport->queue + qindex;
1514
1515         return queue->qr == queue->qw;
1516 }
1517
1518 #endif /* RTE_SCHED_DEBUG */
1519
1520 #ifdef RTE_SCHED_COLLECT_STATS
1521
1522 static inline void
1523 rte_sched_port_update_subport_stats(struct rte_sched_port *port,
1524         struct rte_sched_subport *subport,
1525         uint32_t qindex,
1526         struct rte_mbuf *pkt)
1527 {
1528         uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex);
1529         uint32_t pkt_len = pkt->pkt_len;
1530
1531         subport->stats.n_pkts_tc[tc_index] += 1;
1532         subport->stats.n_bytes_tc[tc_index] += pkt_len;
1533 }
1534
1535 #ifdef RTE_SCHED_RED
1536 static inline void
1537 rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port,
1538         struct rte_sched_subport *subport,
1539         uint32_t qindex,
1540         struct rte_mbuf *pkt,
1541         uint32_t red)
1542 #else
1543 static inline void
1544 rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port,
1545         struct rte_sched_subport *subport,
1546         uint32_t qindex,
1547         struct rte_mbuf *pkt,
1548         __rte_unused uint32_t red)
1549 #endif
1550 {
1551         uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex);
1552         uint32_t pkt_len = pkt->pkt_len;
1553
1554         subport->stats.n_pkts_tc_dropped[tc_index] += 1;
1555         subport->stats.n_bytes_tc_dropped[tc_index] += pkt_len;
1556 #ifdef RTE_SCHED_RED
1557         subport->stats.n_pkts_red_dropped[tc_index] += red;
1558 #endif
1559 }
1560
1561 static inline void
1562 rte_sched_port_update_queue_stats(struct rte_sched_subport *subport,
1563         uint32_t qindex,
1564         struct rte_mbuf *pkt)
1565 {
1566         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1567         uint32_t pkt_len = pkt->pkt_len;
1568
1569         qe->stats.n_pkts += 1;
1570         qe->stats.n_bytes += pkt_len;
1571 }
1572
1573 #ifdef RTE_SCHED_RED
1574 static inline void
1575 rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport,
1576         uint32_t qindex,
1577         struct rte_mbuf *pkt,
1578         uint32_t red)
1579 #else
1580 static inline void
1581 rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport,
1582         uint32_t qindex,
1583         struct rte_mbuf *pkt,
1584         __rte_unused uint32_t red)
1585 #endif
1586 {
1587         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1588         uint32_t pkt_len = pkt->pkt_len;
1589
1590         qe->stats.n_pkts_dropped += 1;
1591         qe->stats.n_bytes_dropped += pkt_len;
1592 #ifdef RTE_SCHED_RED
1593         qe->stats.n_pkts_red_dropped += red;
1594 #endif
1595 }
1596
1597 #endif /* RTE_SCHED_COLLECT_STATS */
1598
1599 #ifdef RTE_SCHED_RED
1600
1601 static inline int
1602 rte_sched_port_red_drop(struct rte_sched_port *port,
1603         struct rte_sched_subport *subport,
1604         struct rte_mbuf *pkt,
1605         uint32_t qindex,
1606         uint16_t qlen)
1607 {
1608         struct rte_sched_queue_extra *qe;
1609         struct rte_red_config *red_cfg;
1610         struct rte_red *red;
1611         uint32_t tc_index;
1612         enum rte_color color;
1613
1614         tc_index = rte_sched_port_pipe_tc(port, qindex);
1615         color = rte_sched_port_pkt_read_color(pkt);
1616         red_cfg = &subport->red_config[tc_index][color];
1617
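        /* Added commentary: min_th and max_th both zero means RED was not
         * configured for this (traffic class, color) pair, so no early drop
         * is ever performed here. */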
1618         if ((red_cfg->min_th | red_cfg->max_th) == 0)
1619                 return 0;
1620
1621         qe = subport->queue_extra + qindex;
1622         red = &qe->red;
1623
1624         return rte_red_enqueue(red_cfg, red, qlen, port->time);
1625 }
1626
1627 static inline void
1628 rte_sched_port_set_queue_empty_timestamp(struct rte_sched_port *port,
1629         struct rte_sched_subport *subport, uint32_t qindex)
1630 {
1631         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1632         struct rte_red *red = &qe->red;
1633
1634         rte_red_mark_queue_empty(red, port->time);
1635 }
1636
1637 #else
1638
1639 static inline int rte_sched_port_red_drop(struct rte_sched_port *port __rte_unused,
1640         struct rte_sched_subport *subport __rte_unused,
1641         struct rte_mbuf *pkt __rte_unused,
1642         uint32_t qindex __rte_unused,
1643         uint16_t qlen __rte_unused)
1644 {
1645         return 0;
1646 }
1647
1648 #define rte_sched_port_set_queue_empty_timestamp(port, subport, qindex)
1649
1650 #endif /* RTE_SCHED_RED */
1651
1652 #ifdef RTE_SCHED_DEBUG
1653
1654 static inline void
1655 debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
1656                        uint64_t bmp_slab)
1657 {
1658         uint64_t mask;
1659         uint32_t i, panic;
1660
1661         if (bmp_slab == 0)
1662                 rte_panic("Empty slab at position %u\n", bmp_pos);
1663
1664         panic = 0;
1665         for (i = 0, mask = 1; i < 64; i++, mask <<= 1) {
1666                 if (mask & bmp_slab) {
1667                         if (rte_sched_port_queue_is_empty(subport, bmp_pos + i)) {
1668                                 printf("Queue %u (slab offset %u) is empty\n", bmp_pos + i, i);
1669                                 panic = 1;
1670                         }
1671                 }
1672         }
1673
1674         if (panic)
1675                 rte_panic("Empty queues in slab 0x%" PRIx64 " starting at position %u\n",
1676                         bmp_slab, bmp_pos);
1677 }
1678
1679 #endif /* RTE_SCHED_DEBUG */
1680
1681 static inline struct rte_sched_subport *
1682 rte_sched_port_subport(struct rte_sched_port *port,
1683         struct rte_mbuf *pkt)
1684 {
1685         uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
1686         uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
1687
1688         return port->subports[subport_id];
1689 }
1690
1691 static inline uint32_t
1692 rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
1693         struct rte_mbuf *pkt, uint32_t subport_qmask)
1694 {
1695         struct rte_sched_queue *q;
1696 #ifdef RTE_SCHED_COLLECT_STATS
1697         struct rte_sched_queue_extra *qe;
1698 #endif
1699         uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
1700         uint32_t subport_queue_id = subport_qmask & qindex;
1701
1702         q = subport->queue + subport_queue_id;
1703         rte_prefetch0(q);
1704 #ifdef RTE_SCHED_COLLECT_STATS
1705         qe = subport->queue_extra + subport_queue_id;
1706         rte_prefetch0(qe);
1707 #endif
1708
1709         return subport_queue_id;
1710 }
1711
1712 static inline void
1713 rte_sched_port_enqueue_qwa_prefetch0(struct rte_sched_port *port,
1714         struct rte_sched_subport *subport,
1715         uint32_t qindex,
1716         struct rte_mbuf **qbase)
1717 {
1718         struct rte_sched_queue *q;
1719         struct rte_mbuf **q_qw;
1720         uint16_t qsize;
1721
1722         q = subport->queue + qindex;
1723         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
1724         q_qw = qbase + (q->qw & (qsize - 1));
1725
1726         rte_prefetch0(q_qw);
1727         rte_bitmap_prefetch0(subport->bmp, qindex);
1728 }
1729
1730 static inline int
1731 rte_sched_port_enqueue_qwa(struct rte_sched_port *port,
1732         struct rte_sched_subport *subport,
1733         uint32_t qindex,
1734         struct rte_mbuf **qbase,
1735         struct rte_mbuf *pkt)
1736 {
1737         struct rte_sched_queue *q;
1738         uint16_t qsize;
1739         uint16_t qlen;
1740
1741         q = subport->queue + qindex;
1742         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
1743         qlen = q->qw - q->qr;
1744
1745         /* Drop the packet (and update drop stats) when the queue is full or RED drops it */
1746         if (unlikely(rte_sched_port_red_drop(port, subport, pkt, qindex, qlen) ||
1747                      (qlen >= qsize))) {
1748                 rte_pktmbuf_free(pkt);
1749 #ifdef RTE_SCHED_COLLECT_STATS
1750                 rte_sched_port_update_subport_stats_on_drop(port, subport,
1751                         qindex, pkt, qlen < qsize);
1752                 rte_sched_port_update_queue_stats_on_drop(subport, qindex, pkt,
1753                         qlen < qsize);
1754 #endif
1755                 return 0;
1756         }
1757
1758         /* Enqueue packet */
1759         qbase[q->qw & (qsize - 1)] = pkt;
1760         q->qw++;
1761
1762         /* Activate queue in the subport bitmap */
1763         rte_bitmap_set(subport->bmp, qindex);
1764
1765         /* Statistics */
1766 #ifdef RTE_SCHED_COLLECT_STATS
1767         rte_sched_port_update_subport_stats(port, subport, qindex, pkt);
1768         rte_sched_port_update_queue_stats(subport, qindex, pkt);
1769 #endif
1770
1771         return 1;
1772 }
1773
1774
1775 /*
1776  * The enqueue function implements a 4-level pipeline with each stage
1777  * processing two different packets. The purpose of using a pipeline
1778  * is to hide the latency of prefetching the data structures. The
1779  * naming convention is presented in the diagram below:
1780  *
1781  *   p00  _______   p10  _______   p20  _______   p30  _______
1782  * ----->|       |----->|       |----->|       |----->|       |----->
1783  *       |   0   |      |   1   |      |   2   |      |   3   |
1784  * ----->|_______|----->|_______|----->|_______|----->|_______|----->
1785  *   p01            p11            p21            p31
1786  *
1787  */
1788 int
1789 rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
1790                        uint32_t n_pkts)
1791 {
1792         struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
1793                 *pkt30, *pkt31, *pkt_last;
1794         struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
1795                 **q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
1796         struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
1797                 *subport20, *subport21, *subport30, *subport31, *subport_last;
1798         uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
1799         uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
1800         uint32_t subport_qmask;
1801         uint32_t result, i;
1802
1803         result = 0;
1804         subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
1805
1806         /*
1807          * Fewer than 6 input packets available, which is not enough to
1808          * feed the pipeline
1809          */
1810         if (unlikely(n_pkts < 6)) {
1811                 struct rte_sched_subport *subports[5];
1812                 struct rte_mbuf **q_base[5];
1813                 uint32_t q[5];
1814
1815                 /* Prefetch the mbuf structure of each packet */
1816                 for (i = 0; i < n_pkts; i++)
1817                         rte_prefetch0(pkts[i]);
1818
1819                 /* Prefetch the subport structure for each packet */
1820                 for (i = 0; i < n_pkts; i++)
1821                         subports[i] = rte_sched_port_subport(port, pkts[i]);
1822
1823                 /* Prefetch the queue structure for each queue */
1824                 for (i = 0; i < n_pkts; i++)
1825                         q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
1826                                         pkts[i], subport_qmask);
1827
1828                 /* Prefetch the write pointer location of each queue */
1829                 for (i = 0; i < n_pkts; i++) {
1830                         q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
1831                         rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
1832                                 q[i], q_base[i]);
1833                 }
1834
1835                 /* Write each packet to its queue */
1836                 for (i = 0; i < n_pkts; i++)
1837                         result += rte_sched_port_enqueue_qwa(port, subports[i],
1838                                                 q[i], q_base[i], pkts[i]);
1839
1840                 return result;
1841         }
1842
1843         /* Feed the first 3 stages of the pipeline (6 packets needed) */
1844         pkt20 = pkts[0];
1845         pkt21 = pkts[1];
1846         rte_prefetch0(pkt20);
1847         rte_prefetch0(pkt21);
1848
1849         pkt10 = pkts[2];
1850         pkt11 = pkts[3];
1851         rte_prefetch0(pkt10);
1852         rte_prefetch0(pkt11);
1853
1854         subport20 = rte_sched_port_subport(port, pkt20);
1855         subport21 = rte_sched_port_subport(port, pkt21);
1856         q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
1857                         pkt20, subport_qmask);
1858         q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
1859                         pkt21, subport_qmask);
1860
1861         pkt00 = pkts[4];
1862         pkt01 = pkts[5];
1863         rte_prefetch0(pkt00);
1864         rte_prefetch0(pkt01);
1865
1866         subport10 = rte_sched_port_subport(port, pkt10);
1867         subport11 = rte_sched_port_subport(port, pkt11);
1868         q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
1869                         pkt10, subport_qmask);
1870         q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
1871                         pkt11, subport_qmask);
1872
1873         q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
1874         q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
1875         rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
1876         rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
1877
1878         /* Run the pipeline */
1879         for (i = 6; i < (n_pkts & (~1)); i += 2) {
1880                 /* Propagate stage inputs */
1881                 pkt30 = pkt20;
1882                 pkt31 = pkt21;
1883                 pkt20 = pkt10;
1884                 pkt21 = pkt11;
1885                 pkt10 = pkt00;
1886                 pkt11 = pkt01;
1887                 q30 = q20;
1888                 q31 = q21;
1889                 q20 = q10;
1890                 q21 = q11;
1891                 subport30 = subport20;
1892                 subport31 = subport21;
1893                 subport20 = subport10;
1894                 subport21 = subport11;
1895                 q30_base = q20_base;
1896                 q31_base = q21_base;
1897
1898                 /* Stage 0: Get packets in */
1899                 pkt00 = pkts[i];
1900                 pkt01 = pkts[i + 1];
1901                 rte_prefetch0(pkt00);
1902                 rte_prefetch0(pkt01);
1903
1904                 /* Stage 1: Prefetch subport and queue structure storing queue pointers */
1905                 subport10 = rte_sched_port_subport(port, pkt10);
1906                 subport11 = rte_sched_port_subport(port, pkt11);
1907                 q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
1908                                 pkt10, subport_qmask);
1909                 q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
1910                                 pkt11, subport_qmask);
1911
1912                 /* Stage 2: Prefetch queue write location */
1913                 q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
1914                 q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
1915                 rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
1916                 rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
1917
1918                 /* Stage 3: Write packet to queue and activate queue */
1919                 r30 = rte_sched_port_enqueue_qwa(port, subport30,
1920                                 q30, q30_base, pkt30);
1921                 r31 = rte_sched_port_enqueue_qwa(port, subport31,
1922                                 q31, q31_base, pkt31);
1923                 result += r30 + r31;
1924         }
1925
1926         /*
1927          * Drain the pipeline (exactly 6 packets).
1928          * Handle the last packet in the case
1929          * of an odd number of input packets.
1930          */
1931         pkt_last = pkts[n_pkts - 1];
1932         rte_prefetch0(pkt_last);
1933
1934         subport00 = rte_sched_port_subport(port, pkt00);
1935         subport01 = rte_sched_port_subport(port, pkt01);
1936         q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
1937                         pkt00, subport_qmask);
1938         q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
1939                         pkt01, subport_qmask);
1940
1941         q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
1942         q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
1943         rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
1944         rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
1945
1946         r20 = rte_sched_port_enqueue_qwa(port, subport20,
1947                         q20, q20_base, pkt20);
1948         r21 = rte_sched_port_enqueue_qwa(port, subport21,
1949                         q21, q21_base, pkt21);
1950         result += r20 + r21;
1951
1952         subport_last = rte_sched_port_subport(port, pkt_last);
1953         q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
1954                                 pkt_last, subport_qmask);
1955
1956         q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
1957         q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
1958         rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
1959         rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
1960
1961         r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
1962                         q10_base, pkt10);
1963         r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
1964                         q11_base, pkt11);
1965         result += r10 + r11;
1966
1967         q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
1968         rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
1969                 q_last, q_last_base);
1970
1971         r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
1972                         q00_base, pkt00);
1973         r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
1974                         q01_base, pkt01);
1975         result += r00 + r01;
1976
1977         if (n_pkts & 1) {
1978                 r_last = rte_sched_port_enqueue_qwa(port, subport_last,
1979                                         q_last, q_last_base, pkt_last);
1980                 result += r_last;
1981         }
1982
1983         return result;
1984 }
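
/*
 * Illustrative sketch, not part of the original file: the scalar
 * equivalent of the pipelined loop above. Each packet goes through the
 * same four stages in order, but without overlapping the prefetch latency
 * of one packet with the processing of another, which is precisely what
 * the software pipeline is there to hide. The function name is
 * hypothetical.
 */
static __rte_unused uint32_t
example_enqueue_scalar(struct rte_sched_port *port, struct rte_mbuf **pkts,
	uint32_t n_pkts)
{
	uint32_t subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
	uint32_t i, result = 0;

	for (i = 0; i < n_pkts; i++) {
		struct rte_sched_subport *subport;
		struct rte_mbuf **q_base;
		uint32_t q;

		/* Stage 0: bring the mbuf into cache */
		rte_prefetch0(pkts[i]);

		/* Stage 1: locate subport and queue, prefetch queue pointers */
		subport = rte_sched_port_subport(port, pkts[i]);
		q = rte_sched_port_enqueue_qptrs_prefetch0(subport, pkts[i],
				subport_qmask);

		/* Stage 2: prefetch the queue write location */
		q_base = rte_sched_subport_pipe_qbase(subport, q);
		rte_sched_port_enqueue_qwa_prefetch0(port, subport, q, q_base);

		/* Stage 3: write the packet and activate the queue */
		result += rte_sched_port_enqueue_qwa(port, subport, q, q_base,
				pkts[i]);
	}

	return result;
}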
1985
1986 #ifndef RTE_SCHED_SUBPORT_TC_OV
1987
1988 static inline void
1989 grinder_credits_update(struct rte_sched_port *port,
1990         struct rte_sched_subport *subport, uint32_t pos)
1991 {
1992         struct rte_sched_grinder *grinder = subport->grinder + pos;
1993         struct rte_sched_pipe *pipe = grinder->pipe;
1994         struct rte_sched_pipe_profile *params = grinder->pipe_params;
1995         uint64_t n_periods;
1996         uint32_t i;
1997
1998         /* Subport TB */
1999         n_periods = (port->time - subport->tb_time) / subport->tb_period;
2000         subport->tb_credits += n_periods * subport->tb_credits_per_period;
2001         subport->tb_credits = rte_sched_min_val_2_u32(subport->tb_credits, subport->tb_size);
2002         subport->tb_time += n_periods * subport->tb_period;
2003
2004         /* Pipe TB */
2005         n_periods = (port->time - pipe->tb_time) / params->tb_period;
2006         pipe->tb_credits += n_periods * params->tb_credits_per_period;
2007         pipe->tb_credits = rte_sched_min_val_2_u32(pipe->tb_credits, params->tb_size);
2008         pipe->tb_time += n_periods * params->tb_period;
2009
2010         /* Subport TCs */
2011         if (unlikely(port->time >= subport->tc_time)) {
2012                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2013                         subport->tc_credits[i] = subport->tc_credits_per_period[i];
2014
2015                 subport->tc_time = port->time + subport->tc_period;
2016         }
2017
2018         /* Pipe TCs */
2019         if (unlikely(port->time >= pipe->tc_time)) {
2020                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2021                         pipe->tc_credits[i] = params->tc_credits_per_period[i];
2022
2023                 pipe->tc_time = port->time + params->tc_period;
2024         }
2025 }
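
/*
 * Added commentary, worked example of the token bucket refill above: with
 * tb_period = 10 and tb_credits_per_period = 5, a pipe last updated 35
 * byte-times ago sees n_periods = 35 / 10 = 3, gains 3 * 5 = 15 credits
 * (saturated at tb_size), and advances tb_time by 30, so the 5 byte-time
 * remainder carries over to the next update.
 */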
2026
2027 #else
2028
2029 static inline uint32_t
2030 grinder_tc_ov_credits_update(struct rte_sched_port *port,
2031         struct rte_sched_subport *subport)
2032 {
2033         uint32_t tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
2034         uint32_t tc_consumption = 0, tc_ov_consumption_max;
2035         uint32_t tc_ov_wm = subport->tc_ov_wm;
2036         uint32_t i;
2037
2038         if (subport->tc_ov == 0)
2039                 return subport->tc_ov_wm_max;
2040
2041         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASS_BE; i++) {
2042                 tc_ov_consumption[i] =
2043                         subport->tc_credits_per_period[i] - subport->tc_credits[i];
2044                 tc_consumption += tc_ov_consumption[i];
2045         }
2046
2047         tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASS_BE] =
2048                 subport->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE] -
2049                 subport->tc_credits[RTE_SCHED_TRAFFIC_CLASS_BE];
2050
2051         tc_ov_consumption_max =
2052                 subport->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE] -
2053                         tc_consumption;
2054
2055         if (tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASS_BE] >
2056                 (tc_ov_consumption_max - port->mtu)) {
2057                 tc_ov_wm -= tc_ov_wm >> 7;
2058                 if (tc_ov_wm < subport->tc_ov_wm_min)
2059                         tc_ov_wm = subport->tc_ov_wm_min;
2060
2061                 return tc_ov_wm;
2062         }
2063
2064         tc_ov_wm += (tc_ov_wm >> 7) + 1;
2065         if (tc_ov_wm > subport->tc_ov_wm_max)
2066                 tc_ov_wm = subport->tc_ov_wm_max;
2067
2068         return tc_ov_wm;
2069 }
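
/*
 * Added commentary: the watermark moves multiplicatively, by about 1/128
 * of its value per subport TC period. It is decreased when the best-effort
 * class consumed nearly all the credits it could have (high BE demand),
 * increased otherwise, and clamped to the range
 * [tc_ov_wm_min, tc_ov_wm_max].
 */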
2070
2071 static inline void
2072 grinder_credits_update(struct rte_sched_port *port,
2073         struct rte_sched_subport *subport, uint32_t pos)
2074 {
2075         struct rte_sched_grinder *grinder = subport->grinder + pos;
2076         struct rte_sched_pipe *pipe = grinder->pipe;
2077         struct rte_sched_pipe_profile *params = grinder->pipe_params;
2078         uint64_t n_periods;
2079         uint32_t i;
2080
2081         /* Subport TB */
2082         n_periods = (port->time - subport->tb_time) / subport->tb_period;
2083         subport->tb_credits += n_periods * subport->tb_credits_per_period;
2084         subport->tb_credits = rte_sched_min_val_2_u32(subport->tb_credits, subport->tb_size);
2085         subport->tb_time += n_periods * subport->tb_period;
2086
2087         /* Pipe TB */
2088         n_periods = (port->time - pipe->tb_time) / params->tb_period;
2089         pipe->tb_credits += n_periods * params->tb_credits_per_period;
2090         pipe->tb_credits = rte_sched_min_val_2_u32(pipe->tb_credits, params->tb_size);
2091         pipe->tb_time += n_periods * params->tb_period;
2092
2093         /* Subport TCs */
2094         if (unlikely(port->time >= subport->tc_time)) {
2095                 subport->tc_ov_wm = grinder_tc_ov_credits_update(port, subport);
2096
2097                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2098                         subport->tc_credits[i] = subport->tc_credits_per_period[i];
2099
2100                 subport->tc_time = port->time + subport->tc_period;
2101                 subport->tc_ov_period_id++;
2102         }
2103
2104         /* Pipe TCs */
2105         if (unlikely(port->time >= pipe->tc_time)) {
2106                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2107                         pipe->tc_credits[i] = params->tc_credits_per_period[i];
2108                 pipe->tc_time = port->time + params->tc_period;
2109         }
2110
2111         /* Pipe TCs - Oversubscription */
2112         if (unlikely(pipe->tc_ov_period_id != subport->tc_ov_period_id)) {
2113                 pipe->tc_ov_credits = subport->tc_ov_wm * params->tc_ov_weight;
2114
2115                 pipe->tc_ov_period_id = subport->tc_ov_period_id;
2116         }
2117 }
2118
2119 #endif /* RTE_SCHED_SUBPORT_TC_OV */
2120
2121
2122 #ifndef RTE_SCHED_SUBPORT_TC_OV
2123
2124 static inline int
2125 grinder_credits_check(struct rte_sched_port *port,
2126         struct rte_sched_subport *subport, uint32_t pos)
2127 {
2128         struct rte_sched_grinder *grinder = subport->grinder + pos;
2129         struct rte_sched_pipe *pipe = grinder->pipe;
2130         struct rte_mbuf *pkt = grinder->pkt;
2131         uint32_t tc_index = grinder->tc_index;
2132         uint32_t pkt_len = pkt->pkt_len + port->frame_overhead;
2133         uint32_t subport_tb_credits = subport->tb_credits;
2134         uint32_t subport_tc_credits = subport->tc_credits[tc_index];
2135         uint32_t pipe_tb_credits = pipe->tb_credits;
2136         uint32_t pipe_tc_credits = pipe->tc_credits[tc_index];
2137         int enough_credits;
2138
2139         /* Check pipe and subport credits */
2140         enough_credits = (pkt_len <= subport_tb_credits) &&
2141                 (pkt_len <= subport_tc_credits) &&
2142                 (pkt_len <= pipe_tb_credits) &&
2143                 (pkt_len <= pipe_tc_credits);
2144
2145         if (!enough_credits)
2146                 return 0;
2147
2148         /* Update pipe and subport credits */
2149         subport->tb_credits -= pkt_len;
2150         subport->tc_credits[tc_index] -= pkt_len;
2151         pipe->tb_credits -= pkt_len;
2152         pipe->tc_credits[tc_index] -= pkt_len;
2153
2154         return 1;
2155 }
2156
2157 #else
2158
2159 static inline int
2160 grinder_credits_check(struct rte_sched_port *port,
2161         struct rte_sched_subport *subport, uint32_t pos)
2162 {
2163         struct rte_sched_grinder *grinder = subport->grinder + pos;
2164         struct rte_sched_pipe *pipe = grinder->pipe;
2165         struct rte_mbuf *pkt = grinder->pkt;
2166         uint32_t tc_index = grinder->tc_index;
2167         uint32_t pkt_len = pkt->pkt_len + port->frame_overhead;
2168         uint32_t subport_tb_credits = subport->tb_credits;
2169         uint32_t subport_tc_credits = subport->tc_credits[tc_index];
2170         uint32_t pipe_tb_credits = pipe->tb_credits;
2171         uint32_t pipe_tc_credits = pipe->tc_credits[tc_index];
2172         uint32_t pipe_tc_ov_mask1[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
2173         uint32_t pipe_tc_ov_mask2[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE] = {0};
2174         uint32_t pipe_tc_ov_credits, i;
2175         int enough_credits;
2176
2177         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2178                 pipe_tc_ov_mask1[i] = UINT32_MAX;
2179
2180         pipe_tc_ov_mask1[RTE_SCHED_TRAFFIC_CLASS_BE] = pipe->tc_ov_credits;
2181         pipe_tc_ov_mask2[RTE_SCHED_TRAFFIC_CLASS_BE] = UINT32_MAX;
2182         pipe_tc_ov_credits = pipe_tc_ov_mask1[tc_index];
2183
2184         /* Check pipe and subport credits */
2185         enough_credits = (pkt_len <= subport_tb_credits) &&
2186                 (pkt_len <= subport_tc_credits) &&
2187                 (pkt_len <= pipe_tb_credits) &&
2188                 (pkt_len <= pipe_tc_credits) &&
2189                 (pkt_len <= pipe_tc_ov_credits);
2190
2191         if (!enough_credits)
2192                 return 0;
2193
2194         /* Update pipe and subport credits */
2195         subport->tb_credits -= pkt_len;
2196         subport->tc_credits[tc_index] -= pkt_len;
2197         pipe->tb_credits -= pkt_len;
2198         pipe->tc_credits[tc_index] -= pkt_len;
2199         pipe->tc_ov_credits -= pipe_tc_ov_mask2[tc_index] & pkt_len;
2200
2201         return 1;
2202 }
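
/*
 * Added commentary: pipe_tc_ov_mask1/mask2 implement a branchless select
 * on tc_index. For a non-BE traffic class, mask1[tc_index] = UINT32_MAX
 * makes the oversubscription check always pass, and mask2[tc_index] = 0
 * leaves tc_ov_credits untouched; only for the BE class is the real
 * tc_ov_credits value both checked and decremented.
 */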
2203
2204 #endif /* RTE_SCHED_SUBPORT_TC_OV */
2205
2206
2207 static inline int
2208 grinder_schedule(struct rte_sched_port *port,
2209         struct rte_sched_subport *subport, uint32_t pos)
2210 {
2211         struct rte_sched_grinder *grinder = subport->grinder + pos;
2212         struct rte_sched_queue *queue = grinder->queue[grinder->qpos];
2213         struct rte_mbuf *pkt = grinder->pkt;
2214         uint32_t pkt_len = pkt->pkt_len + port->frame_overhead;
2215         uint32_t be_tc_active;
2216
2217         if (!grinder_credits_check(port, subport, pos))
2218                 return 0;
2219
2220         /* Advance port time */
2221         port->time += pkt_len;
2222
2223         /* Send packet */
2224         port->pkts_out[port->n_pkts_out++] = pkt;
2225         queue->qr++;
2226
2227         be_tc_active = (grinder->tc_index == RTE_SCHED_TRAFFIC_CLASS_BE) ? ~0x0 : 0x0;
2228         grinder->wrr_tokens[grinder->qpos] +=
2229                 (pkt_len * grinder->wrr_cost[grinder->qpos]) & be_tc_active;
2230
2231         if (queue->qr == queue->qw) {
2232                 uint32_t qindex = grinder->qindex[grinder->qpos];
2233
2234                 rte_bitmap_clear(subport->bmp, qindex);
2235                 grinder->qmask &= ~(1 << grinder->qpos);
2236                 if (be_tc_active)
2237                         grinder->wrr_mask[grinder->qpos] = 0;
2238                 rte_sched_port_set_queue_empty_timestamp(port, subport, qindex);
2239         }
2240
2241         /* Reset pipe loop detection */
2242         subport->pipe_loop = RTE_SCHED_PIPE_INVALID;
2243         grinder->productive = 1;
2244
2245         return 1;
2246 }
2247
2248 #ifdef SCHED_VECTOR_SSE4
2249
2250 static inline int
2251 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2252 {
2253         __m128i index = _mm_set1_epi32(base_pipe);
2254         __m128i pipes = _mm_load_si128((__m128i *)subport->grinder_base_bmp_pos);
2255         __m128i res = _mm_cmpeq_epi32(pipes, index);
2256
2257         pipes = _mm_load_si128((__m128i *)(subport->grinder_base_bmp_pos + 4));
2258         pipes = _mm_cmpeq_epi32(pipes, index);
2259         res = _mm_or_si128(res, pipes);
2260
2261         if (_mm_testz_si128(res, res))
2262                 return 0;
2263
2264         return 1;
2265 }
2266
2267 #elif defined(SCHED_VECTOR_NEON)
2268
2269 static inline int
2270 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2271 {
2272         uint32x4_t index, pipes;
2273         uint32_t *pos = (uint32_t *)subport->grinder_base_bmp_pos;
2274
2275         index = vmovq_n_u32(base_pipe);
2276         pipes = vld1q_u32(pos);
2277         if (!vminvq_u32(veorq_u32(pipes, index)))
2278                 return 1;
2279
2280         pipes = vld1q_u32(pos + 4);
2281         if (!vminvq_u32(veorq_u32(pipes, index)))
2282                 return 1;
2283
2284         return 0;
2285 }
2286
2287 #else
2288
2289 static inline int
2290 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2291 {
2292         uint32_t i;
2293
2294         for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++) {
2295                 if (subport->grinder_base_bmp_pos[i] == base_pipe)
2296                         return 1;
2297         }
2298
2299         return 0;
2300 }
2301
2302 #endif /* SCHED_VECTOR_SSE4, SCHED_VECTOR_NEON */
2303
2304 static inline void
2305 grinder_pcache_populate(struct rte_sched_subport *subport,
2306         uint32_t pos, uint32_t bmp_pos, uint64_t bmp_slab)
2307 {
2308         struct rte_sched_grinder *grinder = subport->grinder + pos;
2309         uint16_t w[4];
2310
2311         grinder->pcache_w = 0;
2312         grinder->pcache_r = 0;
2313
2314         w[0] = (uint16_t) bmp_slab;
2315         w[1] = (uint16_t) (bmp_slab >> 16);
2316         w[2] = (uint16_t) (bmp_slab >> 32);
2317         w[3] = (uint16_t) (bmp_slab >> 48);
2318
2319         grinder->pcache_qmask[grinder->pcache_w] = w[0];
2320         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos;
2321         grinder->pcache_w += (w[0] != 0);
2322
2323         grinder->pcache_qmask[grinder->pcache_w] = w[1];
2324         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 16;
2325         grinder->pcache_w += (w[1] != 0);
2326
2327         grinder->pcache_qmask[grinder->pcache_w] = w[2];
2328         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 32;
2329         grinder->pcache_w += (w[2] != 0);
2330
2331         grinder->pcache_qmask[grinder->pcache_w] = w[3];
2332         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 48;
2333         grinder->pcache_w += (w[3] != 0);
2334 }
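
/*
 * Added commentary, worked example: for bmp_slab = 0x0000000100008001 the
 * 16-bit words are w[0] = 0x8001, w[1] = 0, w[2] = 0x0001, w[3] = 0, so
 * only two pipe cache entries are written: {qmask 0x8001, qindex bmp_pos}
 * and {qmask 0x0001, qindex bmp_pos + 32}, leaving pcache_w = 2.
 */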
2335
2336 static inline void
2337 grinder_tccache_populate(struct rte_sched_subport *subport,
2338         uint32_t pos, uint32_t qindex, uint16_t qmask)
2339 {
2340         struct rte_sched_grinder *grinder = subport->grinder + pos;
2341         uint8_t b, i;
2342
2343         grinder->tccache_w = 0;
2344         grinder->tccache_r = 0;
2345
2346         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASS_BE; i++) {
2347                 b = (uint8_t) ((qmask >> i) & 0x1);
2348                 grinder->tccache_qmask[grinder->tccache_w] = b;
2349                 grinder->tccache_qindex[grinder->tccache_w] = qindex + i;
2350                 grinder->tccache_w += (b != 0);
2351         }
2352
2353         b = (uint8_t) (qmask >> (RTE_SCHED_TRAFFIC_CLASS_BE));
2354         grinder->tccache_qmask[grinder->tccache_w] = b;
2355         grinder->tccache_qindex[grinder->tccache_w] = qindex +
2356                 RTE_SCHED_TRAFFIC_CLASS_BE;
2357         grinder->tccache_w += (b != 0);
2358 }
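
/*
 * Added commentary: the first RTE_SCHED_TRAFFIC_CLASS_BE bits of qmask are
 * the single-queue high-priority traffic classes, cached one bit per
 * entry; the remaining bits are the best-effort queues, cached together as
 * a single entry so grinder_next_tc() can serve them as one 4-queue WRR
 * group.
 */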
2359
2360 static inline int
2361 grinder_next_tc(struct rte_sched_port *port,
2362         struct rte_sched_subport *subport, uint32_t pos)
2363 {
2364         struct rte_sched_grinder *grinder = subport->grinder + pos;
2365         struct rte_mbuf **qbase;
2366         uint32_t qindex;
2367         uint16_t qsize;
2368
2369         if (grinder->tccache_r == grinder->tccache_w)
2370                 return 0;
2371
2372         qindex = grinder->tccache_qindex[grinder->tccache_r];
2373         qbase = rte_sched_subport_pipe_qbase(subport, qindex);
2374         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
2375
2376         grinder->tc_index = rte_sched_port_pipe_tc(port, qindex);
2377         grinder->qmask = grinder->tccache_qmask[grinder->tccache_r];
2378         grinder->qsize = qsize;
2379
2380         if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) {
2381                 grinder->queue[0] = subport->queue + qindex;
2382                 grinder->qbase[0] = qbase;
2383                 grinder->qindex[0] = qindex;
2384                 grinder->tccache_r++;
2385
2386                 return 1;
2387         }
2388
2389         grinder->queue[0] = subport->queue + qindex;
2390         grinder->queue[1] = subport->queue + qindex + 1;
2391         grinder->queue[2] = subport->queue + qindex + 2;
2392         grinder->queue[3] = subport->queue + qindex + 3;
2393
2394         grinder->qbase[0] = qbase;
2395         grinder->qbase[1] = qbase + qsize;
2396         grinder->qbase[2] = qbase + 2 * qsize;
2397         grinder->qbase[3] = qbase + 3 * qsize;
2398
2399         grinder->qindex[0] = qindex;
2400         grinder->qindex[1] = qindex + 1;
2401         grinder->qindex[2] = qindex + 2;
2402         grinder->qindex[3] = qindex + 3;
2403
2404         grinder->tccache_r++;
2405         return 1;
2406 }
2407
2408 static inline int
2409 grinder_next_pipe(struct rte_sched_port *port,
2410         struct rte_sched_subport *subport, uint32_t pos)
2411 {
2412         struct rte_sched_grinder *grinder = subport->grinder + pos;
2413         uint32_t pipe_qindex;
2414         uint16_t pipe_qmask;
2415
2416         if (grinder->pcache_r < grinder->pcache_w) {
2417                 pipe_qmask = grinder->pcache_qmask[grinder->pcache_r];
2418                 pipe_qindex = grinder->pcache_qindex[grinder->pcache_r];
2419                 grinder->pcache_r++;
2420         } else {
2421                 uint64_t bmp_slab = 0;
2422                 uint32_t bmp_pos = 0;
2423
2424                 /* Get another non-empty pipe group */
2425                 if (unlikely(rte_bitmap_scan(subport->bmp, &bmp_pos, &bmp_slab) <= 0))
2426                         return 0;
2427
2428 #ifdef RTE_SCHED_DEBUG
2429                 debug_check_queue_slab(subport, bmp_pos, bmp_slab);
2430 #endif
2431
2432                 /* Return if pipe group already in one of the other grinders */
2433                 subport->grinder_base_bmp_pos[pos] = RTE_SCHED_BMP_POS_INVALID;
2434                 if (unlikely(grinder_pipe_exists(subport, bmp_pos)))
2435                         return 0;
2436
2437                 subport->grinder_base_bmp_pos[pos] = bmp_pos;
2438
2439                 /* Install new pipe group into grinder's pipe cache */
2440                 grinder_pcache_populate(subport, pos, bmp_pos, bmp_slab);
2441
2442                 pipe_qmask = grinder->pcache_qmask[0];
2443                 pipe_qindex = grinder->pcache_qindex[0];
2444                 grinder->pcache_r = 1;
2445         }
2446
2447         /* Install new pipe in the grinder */
2448         grinder->pindex = pipe_qindex >> 4;
2449         grinder->subport = subport;
2450         grinder->pipe = subport->pipe + grinder->pindex;
2451         grinder->pipe_params = NULL; /* to be set after the pipe structure is prefetched */
2452         grinder->productive = 0;
2453
2454         grinder_tccache_populate(subport, pos, pipe_qindex, pipe_qmask);
2455         grinder_next_tc(port, subport, pos);
2456
2457         /* Check for pipe exhaustion */
2458         if (grinder->pindex == subport->pipe_loop) {
2459                 subport->pipe_exhaustion = 1;
2460                 subport->pipe_loop = RTE_SCHED_PIPE_INVALID;
2461         }
2462
2463         return 1;
2464 }
2465
2466
2467 static inline void
2468 grinder_wrr_load(struct rte_sched_subport *subport, uint32_t pos)
2469 {
2470         struct rte_sched_grinder *grinder = subport->grinder + pos;
2471         struct rte_sched_pipe *pipe = grinder->pipe;
2472         struct rte_sched_pipe_profile *pipe_params = grinder->pipe_params;
2473         uint32_t qmask = grinder->qmask;
2474
2475         grinder->wrr_tokens[0] =
2476                 ((uint16_t) pipe->wrr_tokens[0]) << RTE_SCHED_WRR_SHIFT;
2477         grinder->wrr_tokens[1] =
2478                 ((uint16_t) pipe->wrr_tokens[1]) << RTE_SCHED_WRR_SHIFT;
2479         grinder->wrr_tokens[2] =
2480                 ((uint16_t) pipe->wrr_tokens[2]) << RTE_SCHED_WRR_SHIFT;
2481         grinder->wrr_tokens[3] =
2482                 ((uint16_t) pipe->wrr_tokens[3]) << RTE_SCHED_WRR_SHIFT;
2483
2484         grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
2485         grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
2486         grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
2487         grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
2488
2489         grinder->wrr_cost[0] = pipe_params->wrr_cost[0];
2490         grinder->wrr_cost[1] = pipe_params->wrr_cost[1];
2491         grinder->wrr_cost[2] = pipe_params->wrr_cost[2];
2492         grinder->wrr_cost[3] = pipe_params->wrr_cost[3];
2493 }
2494
2495 static inline void
2496 grinder_wrr_store(struct rte_sched_subport *subport, uint32_t pos)
2497 {
2498         struct rte_sched_grinder *grinder = subport->grinder + pos;
2499         struct rte_sched_pipe *pipe = grinder->pipe;
2500
2501         pipe->wrr_tokens[0] =
2502                         (grinder->wrr_tokens[0] & grinder->wrr_mask[0]) >>
2503                                 RTE_SCHED_WRR_SHIFT;
2504         pipe->wrr_tokens[1] =
2505                         (grinder->wrr_tokens[1] & grinder->wrr_mask[1]) >>
2506                                 RTE_SCHED_WRR_SHIFT;
2507         pipe->wrr_tokens[2] =
2508                         (grinder->wrr_tokens[2] & grinder->wrr_mask[2]) >>
2509                                 RTE_SCHED_WRR_SHIFT;
2510         pipe->wrr_tokens[3] =
2511                         (grinder->wrr_tokens[3] & grinder->wrr_mask[3]) >>
2512                                 RTE_SCHED_WRR_SHIFT;
2513 }
2514
2515 static inline void
2516 grinder_wrr(struct rte_sched_subport *subport, uint32_t pos)
2517 {
2518         struct rte_sched_grinder *grinder = subport->grinder + pos;
2519         uint16_t wrr_tokens_min;
2520
2521         grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
2522         grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
2523         grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
2524         grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
2525
2526         grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
2527         wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
2528
2529         grinder->wrr_tokens[0] -= wrr_tokens_min;
2530         grinder->wrr_tokens[1] -= wrr_tokens_min;
2531         grinder->wrr_tokens[2] -= wrr_tokens_min;
2532         grinder->wrr_tokens[3] -= wrr_tokens_min;
2533 }
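
/*
 * Added commentary, worked example: with wrr_tokens = {12, 4, 4, 9} after
 * masking, the queue with the lowest token count is selected (ties broken
 * by rte_min_pos_4_u16()) and the common minimum 4 is subtracted, leaving
 * {8, 0, 0, 5}. Since grinder_schedule() adds pkt_len * wrr_cost to the
 * serving queue, queues with lower cost (higher weight) accumulate tokens
 * more slowly and are therefore selected more often.
 */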
2534
2535
2536 #define grinder_evict(subport, pos)
2537
2538 static inline void
2539 grinder_prefetch_pipe(struct rte_sched_subport *subport, uint32_t pos)
2540 {
2541         struct rte_sched_grinder *grinder = subport->grinder + pos;
2542
2543         rte_prefetch0(grinder->pipe);
2544         rte_prefetch0(grinder->queue[0]);
2545 }
2546
2547 static inline void
2548 grinder_prefetch_tc_queue_arrays(struct rte_sched_subport *subport, uint32_t pos)
2549 {
2550         struct rte_sched_grinder *grinder = subport->grinder + pos;
2551         uint16_t qsize, qr[RTE_SCHED_MAX_QUEUES_PER_TC];
2552
2553         qsize = grinder->qsize;
2554         grinder->qpos = 0;
2555
2556         if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) {
2557                 qr[0] = grinder->queue[0]->qr & (qsize - 1);
2558
2559                 rte_prefetch0(grinder->qbase[0] + qr[0]);
2560                 return;
2561         }
2562
2563         qr[0] = grinder->queue[0]->qr & (qsize - 1);
2564         qr[1] = grinder->queue[1]->qr & (qsize - 1);
2565         qr[2] = grinder->queue[2]->qr & (qsize - 1);
2566         qr[3] = grinder->queue[3]->qr & (qsize - 1);
2567
2568         rte_prefetch0(grinder->qbase[0] + qr[0]);
2569         rte_prefetch0(grinder->qbase[1] + qr[1]);
2570
2571         grinder_wrr_load(subport, pos);
2572         grinder_wrr(subport, pos);
2573
2574         rte_prefetch0(grinder->qbase[2] + qr[2]);
2575         rte_prefetch0(grinder->qbase[3] + qr[3]);
2576 }
2577
2578 static inline void
2579 grinder_prefetch_mbuf(struct rte_sched_subport *subport, uint32_t pos)
2580 {
2581         struct rte_sched_grinder *grinder = subport->grinder + pos;
2582         uint32_t qpos = grinder->qpos;
2583         struct rte_mbuf **qbase = grinder->qbase[qpos];
2584         uint16_t qsize = grinder->qsize;
2585         uint16_t qr = grinder->queue[qpos]->qr & (qsize - 1);
2586
2587         grinder->pkt = qbase[qr];
2588         rte_prefetch0(grinder->pkt);
2589
2590         if (unlikely((qr & 0x7) == 7)) {
2591                 uint16_t qr_next = (grinder->queue[qpos]->qr + 1) & (qsize - 1);
2592
2593                 rte_prefetch0(qbase + qr_next);
2594         }
2595 }
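
/*
 * Added commentary: with 8-byte mbuf pointers, (qr & 0x7) == 7 marks the
 * last slot of a 64-byte cache line, so the next line of the queue array
 * is prefetched one dequeue ahead of its first use.
 */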
2596
2597 static inline uint32_t
2598 grinder_handle(struct rte_sched_port *port,
2599         struct rte_sched_subport *subport, uint32_t pos)
2600 {
2601         struct rte_sched_grinder *grinder = subport->grinder + pos;
2602
2603         switch (grinder->state) {
2604         case e_GRINDER_PREFETCH_PIPE:
2605         {
2606                 if (grinder_next_pipe(port, subport, pos)) {
2607                         grinder_prefetch_pipe(subport, pos);
2608                         subport->busy_grinders++;
2609
2610                         grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS;
2611                         return 0;
2612                 }
2613
2614                 return 0;
2615         }
2616
2617         case e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS:
2618         {
2619                 struct rte_sched_pipe *pipe = grinder->pipe;
2620
2621                 grinder->pipe_params = subport->pipe_profiles + pipe->profile;
2622                 grinder_prefetch_tc_queue_arrays(subport, pos);
2623                 grinder_credits_update(port, subport, pos);
2624
2625                 grinder->state = e_GRINDER_PREFETCH_MBUF;
2626                 return 0;
2627         }
2628
2629         case e_GRINDER_PREFETCH_MBUF:
2630         {
2631                 grinder_prefetch_mbuf(subport, pos);
2632
2633                 grinder->state = e_GRINDER_READ_MBUF;
2634                 return 0;
2635         }
2636
2637         case e_GRINDER_READ_MBUF:
2638         {
2639                 uint32_t wrr_active, result = 0;
2640
2641                 result = grinder_schedule(port, subport, pos);
2642
2643                 wrr_active = (grinder->tc_index == RTE_SCHED_TRAFFIC_CLASS_BE);
2644
2645                 /* Look for next packet within the same TC */
2646                 if (result && grinder->qmask) {
2647                         if (wrr_active)
2648                                 grinder_wrr(subport, pos);
2649
2650                         grinder_prefetch_mbuf(subport, pos);
2651
2652                         return 1;
2653                 }
2654
2655                 if (wrr_active)
2656                         grinder_wrr_store(subport, pos);
2657
2658                 /* Look for another active TC within same pipe */
2659                 if (grinder_next_tc(port, subport, pos)) {
2660                         grinder_prefetch_tc_queue_arrays(subport, pos);
2661
2662                         grinder->state = e_GRINDER_PREFETCH_MBUF;
2663                         return result;
2664                 }
2665
2666                 if (grinder->productive == 0 &&
2667                     subport->pipe_loop == RTE_SCHED_PIPE_INVALID)
2668                         subport->pipe_loop = grinder->pindex;
2669
2670                 grinder_evict(subport, pos);
2671
2672                 /* Look for another active pipe */
2673                 if (grinder_next_pipe(port, subport, pos)) {
2674                         grinder_prefetch_pipe(subport, pos);
2675
2676                         grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS;
2677                         return result;
2678                 }
2679
2680                 /* No active pipe found */
2681                 subport->busy_grinders--;
2682
2683                 grinder->state = e_GRINDER_PREFETCH_PIPE;
2684                 return result;
2685         }
2686
2687         default:
2688                 rte_panic("Algorithmic error (invalid state)\n");
2689                 return 0;
2690         }
2691 }
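
/*
 * Added commentary: each grinder walks the state machine above as
 * PREFETCH_PIPE -> PREFETCH_TC_QUEUE_ARRAYS -> PREFETCH_MBUF -> READ_MBUF,
 * one grinder_handle() call per state, so the data needed by the next
 * state is prefetched while the other grinders run. READ_MBUF then keeps
 * scheduling within the same TC, the next TC, or the next pipe for as long
 * as it finds work.
 */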
2692
2693 static inline void
2694 rte_sched_port_time_resync(struct rte_sched_port *port)
2695 {
2696         uint64_t cycles = rte_get_tsc_cycles();
2697         uint64_t cycles_diff = cycles - port->time_cpu_cycles;
2698         uint64_t bytes_diff;
2699         uint32_t i;
2700
2701         /* Compute elapsed time in bytes */
2702         bytes_diff = rte_reciprocal_divide(cycles_diff << RTE_SCHED_TIME_SHIFT,
2703                                            port->inv_cycles_per_byte);
2704
2705         /* Advance port time */
2706         port->time_cpu_cycles = cycles;
2707         port->time_cpu_bytes += bytes_diff;
2708         if (port->time < port->time_cpu_bytes)
2709                 port->time = port->time_cpu_bytes;
2710
2711         /* Reset pipe loop detection */
2712         for (i = 0; i < port->n_subports_per_port; i++)
2713                 port->subports[i]->pipe_loop = RTE_SCHED_PIPE_INVALID;
2714 }
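
/*
 * Added commentary: inv_cycles_per_byte is the precomputed rte_reciprocal
 * of the port's cycles-per-byte ratio, scaled by 2^RTE_SCHED_TIME_SHIFT at
 * port configuration time, so the division above evaluates bytes_diff ~=
 * cycles_diff * rate / tsc_hz without a runtime divide. Note that
 * port->time only ever moves forward.
 */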
2715
2716 static inline int
2717 rte_sched_port_exceptions(struct rte_sched_subport *subport, int second_pass)
2718 {
2719         int exceptions;
2720
2721         /* Check if any exception flag is set */
2722         exceptions = (second_pass && subport->busy_grinders == 0) ||
2723                 (subport->pipe_exhaustion == 1);
2724
2725         /* Clear exception flags */
2726         subport->pipe_exhaustion = 0;
2727
2728         return exceptions;
2729 }
2730
2731 int
2732 rte_sched_port_dequeue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts)
2733 {
2734         struct rte_sched_subport *subport;
2735         uint32_t subport_id = port->subport_id;
2736         uint32_t i, n_subports = 0, count;
2737
2738         port->pkts_out = pkts;
2739         port->n_pkts_out = 0;
2740
2741         rte_sched_port_time_resync(port);
2742
2743         /* Take each queue in the grinder one step further */
2744         for (i = 0, count = 0; ; i++) {
2745                 subport = port->subports[subport_id];
2746
2747                 count += grinder_handle(port, subport,
2748                                 i & (RTE_SCHED_PORT_N_GRINDERS - 1));
2749
2750                 if (count == n_pkts) {
2751                         subport_id++;
2752
2753                         if (subport_id == port->n_subports_per_port)
2754                                 subport_id = 0;
2755
2756                         port->subport_id = subport_id;
2757                         break;
2758                 }
2759
2760                 if (rte_sched_port_exceptions(subport, i >= RTE_SCHED_PORT_N_GRINDERS)) {
2761                         i = 0;
2762                         subport_id++;
2763                         n_subports++;
2764                 }
2765
2766                 if (subport_id == port->n_subports_per_port)
2767                         subport_id = 0;
2768
2769                 if (n_subports == port->n_subports_per_port) {
2770                         port->subport_id = subport_id;
2771                         break;
2772                 }
2773         }
2774
2775         return count;
2776 }
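
/*
 * Illustrative sketch, not part of the original file: a run-to-completion
 * loop pairing enqueue and dequeue, in the style of the qos_sched example
 * application. The function name is hypothetical and the RX/TX plumbing
 * (e.g. rte_eth_rx_burst()/rte_eth_tx_burst()) is reduced to placeholder
 * arrays and counts.
 */
static __rte_unused void
example_sched_iteration(struct rte_sched_port *port)
{
	struct rte_mbuf *rx_pkts[64];   /* hypothetically filled by RX burst */
	struct rte_mbuf *tx_pkts[64];
	uint32_t n_rx = 0;              /* hypothetical RX burst size */
	int n_tx;

	/* Packets must already carry a valid tree path, e.g. set with
	 * rte_sched_port_pkt_write(), before being enqueued. */
	rte_sched_port_enqueue(port, rx_pkts, n_rx);

	n_tx = rte_sched_port_dequeue(port, tx_pkts, 64);
	(void)n_tx;                     /* hypothetically sent via TX burst */
}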