sched: fix 64-bit rate
[dpdk.git] / lib / librte_sched / rte_sched.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <string.h>

#include <rte_common.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_branch_prediction.h>
#include <rte_mbuf.h>
#include <rte_bitmap.h>
#include <rte_reciprocal.h>

#include "rte_sched.h"
#include "rte_sched_common.h"
#include "rte_approx.h"

#ifdef __INTEL_COMPILER
#pragma warning(disable:2259) /* conversion may lose significant bits */
#endif

#ifdef RTE_SCHED_VECTOR
#include <rte_vect.h>

#ifdef RTE_ARCH_X86
#define SCHED_VECTOR_SSE4
#elif defined(RTE_MACHINE_CPUFLAG_NEON)
#define SCHED_VECTOR_NEON
#endif

#endif

#define RTE_SCHED_TB_RATE_CONFIG_ERR          (1e-7)
#define RTE_SCHED_WRR_SHIFT                   3
#define RTE_SCHED_MAX_QUEUES_PER_TC           RTE_SCHED_BE_QUEUES_PER_PIPE
#define RTE_SCHED_GRINDER_PCACHE_SIZE         (64 / RTE_SCHED_QUEUES_PER_PIPE)
#define RTE_SCHED_PIPE_INVALID                UINT32_MAX
#define RTE_SCHED_BMP_POS_INVALID             UINT32_MAX

/* Scaling for cycles_per_byte calculation
 * Chosen so that minimum rate is 480 bit/sec
 */
#define RTE_SCHED_TIME_SHIFT                  8

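/*
 * Worked example (illustrative, not from the original source): with a
 * 2.0 GHz TSC and a port rate of 1.25e9 bytes/sec (10 GbE), the port
 * computes cycles_per_byte = (2000000000 << 8) / 1250000000 = 409 in
 * fixed point, i.e. ~1.6 CPU cycles per byte once the 2^8 scaling is
 * divided back out.
 */
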
struct rte_sched_pipe_profile {
        /* Token bucket (TB) */
        uint64_t tb_period;
        uint64_t tb_credits_per_period;
        uint64_t tb_size;

        /* Pipe traffic classes */
        uint64_t tc_period;
        uint64_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint8_t tc_ov_weight;

        /* Pipe best-effort traffic class queues */
        uint8_t  wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
};

struct rte_sched_pipe {
        /* Token bucket (TB) */
        uint64_t tb_time; /* time of last update */
        uint64_t tb_credits;

        /* Pipe profile and flags */
        uint32_t profile;

        /* Traffic classes (TCs) */
        uint64_t tc_time; /* time of next update */
        uint64_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];

        /* Weighted Round Robin (WRR) */
        uint8_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE];

        /* TC oversubscription */
        uint64_t tc_ov_credits;
        uint8_t tc_ov_period_id;
} __rte_cache_aligned;

struct rte_sched_queue {
        uint16_t qw;
        uint16_t qr;
};

struct rte_sched_queue_extra {
        struct rte_sched_queue_stats stats;
#ifdef RTE_SCHED_RED
        struct rte_red red;
#endif
};

enum grinder_state {
        e_GRINDER_PREFETCH_PIPE = 0,
        e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS,
        e_GRINDER_PREFETCH_MBUF,
        e_GRINDER_READ_MBUF
};

struct rte_sched_grinder {
        /* Pipe cache */
        uint16_t pcache_qmask[RTE_SCHED_GRINDER_PCACHE_SIZE];
        uint32_t pcache_qindex[RTE_SCHED_GRINDER_PCACHE_SIZE];
        uint32_t pcache_w;
        uint32_t pcache_r;

        /* Current pipe */
        enum grinder_state state;
        uint32_t productive;
        uint32_t pindex;
        struct rte_sched_subport *subport;
        struct rte_sched_pipe *pipe;
        struct rte_sched_pipe_profile *pipe_params;

        /* TC cache */
        uint8_t tccache_qmask[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tccache_qindex[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint32_t tccache_w;
        uint32_t tccache_r;

        /* Current TC */
        uint32_t tc_index;
        struct rte_sched_queue *queue[RTE_SCHED_MAX_QUEUES_PER_TC];
        struct rte_mbuf **qbase[RTE_SCHED_MAX_QUEUES_PER_TC];
        uint32_t qindex[RTE_SCHED_MAX_QUEUES_PER_TC];
        uint16_t qsize;
        uint32_t qmask;
        uint32_t qpos;
        struct rte_mbuf *pkt;

        /* WRR */
        uint16_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint16_t wrr_mask[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint8_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
};

struct rte_sched_subport {
        /* Token bucket (TB) */
        uint64_t tb_time; /* time of last update */
        uint64_t tb_period;
        uint64_t tb_credits_per_period;
        uint64_t tb_size;
        uint64_t tb_credits;

        /* Traffic classes (TCs) */
        uint64_t tc_time; /* time of next update */
        uint64_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint64_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint64_t tc_period;

        /* TC oversubscription */
        uint64_t tc_ov_wm;
        uint64_t tc_ov_wm_min;
        uint64_t tc_ov_wm_max;
        uint8_t tc_ov_period_id;
        uint8_t tc_ov;
        uint32_t tc_ov_n;
        double tc_ov_rate;

        /* Statistics */
        struct rte_sched_subport_stats stats __rte_cache_aligned;

        /* Subport pipes */
        uint32_t n_pipes_per_subport_enabled;
        uint32_t n_pipe_profiles;
        uint32_t n_max_pipe_profiles;

        /* Pipe best-effort TC rate */
        uint64_t pipe_tc_be_rate_max;

        /* Pipe queues size */
        uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];

#ifdef RTE_SCHED_RED
        struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS];
#endif

        /* Scheduling loop detection */
        uint32_t pipe_loop;
        uint32_t pipe_exhaustion;

        /* Bitmap */
        struct rte_bitmap *bmp;
        uint32_t grinder_base_bmp_pos[RTE_SCHED_PORT_N_GRINDERS] __rte_aligned_16;

        /* Grinders */
        struct rte_sched_grinder grinder[RTE_SCHED_PORT_N_GRINDERS];
        uint32_t busy_grinders;

        /* Queue base calculation */
        uint32_t qsize_add[RTE_SCHED_QUEUES_PER_PIPE];
        uint32_t qsize_sum;

        struct rte_sched_pipe *pipe;
        struct rte_sched_queue *queue;
        struct rte_sched_queue_extra *queue_extra;
        struct rte_sched_pipe_profile *pipe_profiles;
        uint8_t *bmp_array;
        struct rte_mbuf **queue_array;
        uint8_t memory[0] __rte_cache_aligned;
} __rte_cache_aligned;

struct rte_sched_port {
        /* User parameters */
        uint32_t n_subports_per_port;
        uint32_t n_pipes_per_subport;
        uint32_t n_pipes_per_subport_log2;
        uint16_t pipe_queue[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
        uint8_t pipe_tc[RTE_SCHED_QUEUES_PER_PIPE];
        uint8_t tc_queue[RTE_SCHED_QUEUES_PER_PIPE];
        uint64_t rate;
        uint32_t mtu;
        uint32_t frame_overhead;
        int socket;

        /* Timing */
        uint64_t time_cpu_cycles;     /* Current CPU time measured in CPU cycles */
        uint64_t time_cpu_bytes;      /* Current CPU time measured in bytes */
        uint64_t time;                /* Current NIC TX time measured in bytes */
        struct rte_reciprocal inv_cycles_per_byte; /* CPU cycles per byte */

        /* Grinders */
        struct rte_mbuf **pkts_out;
        uint32_t n_pkts_out;
        uint32_t subport_id;

        /* Large data structures */
        struct rte_sched_subport *subports[0] __rte_cache_aligned;
} __rte_cache_aligned;

enum rte_sched_subport_array {
        e_RTE_SCHED_SUBPORT_ARRAY_PIPE = 0,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA,
        e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES,
        e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY,
        e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY,
        e_RTE_SCHED_SUBPORT_ARRAY_TOTAL,
};

static inline uint32_t
rte_sched_subport_pipe_queues(struct rte_sched_subport *subport)
{
        return RTE_SCHED_QUEUES_PER_PIPE * subport->n_pipes_per_subport_enabled;
}

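/*
 * The qindex >> 4 below relies on RTE_SCHED_QUEUES_PER_PIPE being 16
 * (12 strict-priority queues plus 4 best-effort queues): the upper bits
 * of qindex select the pipe, the lower 4 bits the queue within it.
 */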
static inline struct rte_mbuf **
rte_sched_subport_pipe_qbase(struct rte_sched_subport *subport, uint32_t qindex)
{
        uint32_t pindex = qindex >> 4;
        uint32_t qpos = qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1);

        return (subport->queue_array + pindex *
                subport->qsize_sum + subport->qsize_add[qpos]);
}

static inline uint16_t
rte_sched_subport_pipe_qsize(struct rte_sched_port *port,
struct rte_sched_subport *subport, uint32_t qindex)
{
        uint32_t tc = port->pipe_tc[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return subport->qsize[tc];
}

static inline uint32_t
rte_sched_port_queues_per_port(struct rte_sched_port *port)
{
        uint32_t n_queues = 0, i;

        for (i = 0; i < port->n_subports_per_port; i++)
                n_queues += rte_sched_subport_pipe_queues(port->subports[i]);

        return n_queues;
}

static inline uint16_t
rte_sched_port_pipe_queue(struct rte_sched_port *port, uint32_t traffic_class)
{
        uint16_t pipe_queue = port->pipe_queue[traffic_class];

        return pipe_queue;
}

static inline uint8_t
rte_sched_port_pipe_tc(struct rte_sched_port *port, uint32_t qindex)
{
        uint8_t pipe_tc = port->pipe_tc[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return pipe_tc;
}

static inline uint8_t
rte_sched_port_tc_queue(struct rte_sched_port *port, uint32_t qindex)
{
        uint8_t tc_queue = port->tc_queue[qindex & (RTE_SCHED_QUEUES_PER_PIPE - 1)];

        return tc_queue;
}

static int
pipe_profile_check(struct rte_sched_pipe_params *params,
        uint64_t rate, uint16_t *qsize)
{
        uint32_t i;

        /* Pipe parameters */
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        /* TB rate: non-zero, not greater than port rate */
        if (params->tb_rate == 0 ||
                params->tb_rate > rate) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb rate\n", __func__);
                return -EINVAL;
        }

        /* TB size: non-zero */
        if (params->tb_size == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb size\n", __func__);
                return -EINVAL;
        }

        /* TC rate: non-zero if qsize non-zero, less than pipe rate */
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if ((qsize[i] == 0 && params->tc_rate[i] != 0) ||
                        (qsize[i] != 0 && (params->tc_rate[i] == 0 ||
                        params->tc_rate[i] > params->tb_rate))) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for qsize or tc_rate\n", __func__);
                        return -EINVAL;
                }
        }

        if (params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE] == 0 ||
                qsize[RTE_SCHED_TRAFFIC_CLASS_BE] == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for be traffic class rate\n", __func__);
                return -EINVAL;
        }

        /* TC period: non-zero */
        if (params->tc_period == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc period\n", __func__);
                return -EINVAL;
        }

        /* Best effort tc oversubscription weight: non-zero */
        if (params->tc_ov_weight == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc ov weight\n", __func__);
                return -EINVAL;
        }

        /* Queue WRR weights: non-zero */
        for (i = 0; i < RTE_SCHED_BE_QUEUES_PER_PIPE; i++) {
                if (params->wrr_weights[i] == 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for wrr weight\n", __func__);
                        return -EINVAL;
                }
        }

        return 0;
}

static int
rte_sched_port_check_params(struct rte_sched_port_params *params)
{
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        /* socket */
        if (params->socket < 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for socket id\n", __func__);
                return -EINVAL;
        }

        /* rate */
        if (params->rate == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for rate\n", __func__);
                return -EINVAL;
        }

        /* mtu */
        if (params->mtu == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for mtu\n", __func__);
                return -EINVAL;
        }

        /* n_subports_per_port: non-zero, limited to 16 bits, power of 2 */
        if (params->n_subports_per_port == 0 ||
            params->n_subports_per_port > 1u << 16 ||
            !rte_is_power_of_2(params->n_subports_per_port)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for number of subports\n", __func__);
                return -EINVAL;
        }

        /* n_pipes_per_subport: non-zero, power of 2 */
        if (params->n_pipes_per_subport == 0 ||
            !rte_is_power_of_2(params->n_pipes_per_subport)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for maximum pipes number\n", __func__);
                return -EINVAL;
        }

        return 0;
}

static uint32_t
rte_sched_subport_get_array_base(struct rte_sched_subport_params *params,
        enum rte_sched_subport_array array)
{
        uint32_t n_pipes_per_subport = params->n_pipes_per_subport_enabled;
        uint32_t n_subport_pipe_queues =
                RTE_SCHED_QUEUES_PER_PIPE * n_pipes_per_subport;

        uint32_t size_pipe = n_pipes_per_subport * sizeof(struct rte_sched_pipe);
        uint32_t size_queue =
                n_subport_pipe_queues * sizeof(struct rte_sched_queue);
        uint32_t size_queue_extra
                = n_subport_pipe_queues * sizeof(struct rte_sched_queue_extra);
        uint32_t size_pipe_profiles = params->n_max_pipe_profiles *
                sizeof(struct rte_sched_pipe_profile);
        uint32_t size_bmp_array =
                rte_bitmap_get_memory_footprint(n_subport_pipe_queues);
        uint32_t size_per_pipe_queue_array, size_queue_array;

        uint32_t base, i;

        size_per_pipe_queue_array = 0;
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if (i < RTE_SCHED_TRAFFIC_CLASS_BE)
                        size_per_pipe_queue_array +=
                                params->qsize[i] * sizeof(struct rte_mbuf *);
                else
                        size_per_pipe_queue_array += RTE_SCHED_MAX_QUEUES_PER_TC *
                                params->qsize[i] * sizeof(struct rte_mbuf *);
        }
        size_queue_array = n_pipes_per_subport * size_per_pipe_queue_array;

        base = 0;

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_PIPE)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_pipe);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue_extra);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_pipe_profiles);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_bmp_array);

        if (array == e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY)
                return base;
        base += RTE_CACHE_LINE_ROUNDUP(size_queue_array);

        return base;
}

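/*
 * qsize_add[] holds per-pipe prefix sums of the queue sizes, i.e. the
 * offset of each queue within the pipe's slice of queue_array, with the
 * best-effort traffic class contributing 4 equally-sized queues.
 * For example, with all 13 traffic classes configured with qsize = 64:
 * qsize_add = {0, 64, 128, ..., 704, 768, 832, 896, 960} and
 * qsize_sum = 1024 mbuf slots per pipe.
 */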
static void
rte_sched_subport_config_qsize(struct rte_sched_subport *subport)
{
        uint32_t i;

        subport->qsize_add[0] = 0;

        /* Strict priority traffic class */
        for (i = 1; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                subport->qsize_add[i] = subport->qsize_add[i-1] + subport->qsize[i-1];

        /* Best-effort traffic class */
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 1] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 2] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 1] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
        subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 3] =
                subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 2] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];

        subport->qsize_sum = subport->qsize_add[RTE_SCHED_TRAFFIC_CLASS_BE + 3] +
                subport->qsize[RTE_SCHED_TRAFFIC_CLASS_BE];
}

static void
rte_sched_port_log_pipe_profile(struct rte_sched_subport *subport, uint32_t i)
{
        struct rte_sched_pipe_profile *p = subport->pipe_profiles + i;

        RTE_LOG(DEBUG, SCHED, "Low level config for pipe profile %u:\n"
                "       Token bucket: period = %"PRIu64", credits per period = %"PRIu64", size = %"PRIu64"\n"
                "       Traffic classes: period = %"PRIu64",\n"
                "       credits per period = [%"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64
                ", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64
                ", %"PRIu64", %"PRIu64", %"PRIu64"]\n"
                "       Best-effort traffic class oversubscription: weight = %hhu\n"
                "       WRR cost: [%hhu, %hhu, %hhu, %hhu]\n",
                i,

                /* Token bucket */
                p->tb_period,
                p->tb_credits_per_period,
                p->tb_size,

                /* Traffic classes */
                p->tc_period,
                p->tc_credits_per_period[0],
                p->tc_credits_per_period[1],
                p->tc_credits_per_period[2],
                p->tc_credits_per_period[3],
                p->tc_credits_per_period[4],
                p->tc_credits_per_period[5],
                p->tc_credits_per_period[6],
                p->tc_credits_per_period[7],
                p->tc_credits_per_period[8],
                p->tc_credits_per_period[9],
                p->tc_credits_per_period[10],
                p->tc_credits_per_period[11],
                p->tc_credits_per_period[12],

                /* Best-effort traffic class oversubscription */
                p->tc_ov_weight,

                /* WRR */
                p->wrr_cost[0], p->wrr_cost[1], p->wrr_cost[2], p->wrr_cost[3]);
}

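/*
 * Convert a duration in milliseconds into the equivalent number of bytes
 * transmitted at the given rate (bytes/sec). For example, 10 ms at
 * 1.25e9 bytes/sec (10 GbE) corresponds to 12,500,000 bytes.
 */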
static inline uint64_t
rte_sched_time_ms_to_bytes(uint64_t time_ms, uint64_t rate)
{
        uint64_t time = time_ms;

        time = (time * rate) / 1000;

        return time;
}

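/*
 * Note on the WRR costs computed below: the best-effort queue weights are
 * inverted and scaled by their lowest common denominator, so a queue's
 * cost is inversely proportional to its weight. For example (illustrative
 * values), weights {1, 2, 3, 4} give lcd = 12 and wrr_cost = {12, 6, 4, 3}.
 */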
static void
rte_sched_pipe_profile_convert(struct rte_sched_subport *subport,
        struct rte_sched_pipe_params *src,
        struct rte_sched_pipe_profile *dst,
        uint64_t rate)
{
        uint32_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
        uint32_t lcd1, lcd2, lcd;
        uint32_t i;

        /* Token Bucket */
        if (src->tb_rate == rate) {
                dst->tb_credits_per_period = 1;
                dst->tb_period = 1;
        } else {
                double tb_rate = (double) src->tb_rate
                                / (double) rate;
                double d = RTE_SCHED_TB_RATE_CONFIG_ERR;

                rte_approx_64(tb_rate, d, &dst->tb_credits_per_period,
                        &dst->tb_period);
        }

        dst->tb_size = src->tb_size;

        /* Traffic Classes */
        dst->tc_period = rte_sched_time_ms_to_bytes(src->tc_period,
                                                rate);

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (subport->qsize[i])
                        dst->tc_credits_per_period[i]
                                = rte_sched_time_ms_to_bytes(src->tc_period,
                                        src->tc_rate[i]);

        dst->tc_ov_weight = src->tc_ov_weight;

        /* WRR queues */
        wrr_cost[0] = src->wrr_weights[0];
        wrr_cost[1] = src->wrr_weights[1];
        wrr_cost[2] = src->wrr_weights[2];
        wrr_cost[3] = src->wrr_weights[3];

        lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]);
        lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]);
        lcd = rte_get_lcd(lcd1, lcd2);

        wrr_cost[0] = lcd / wrr_cost[0];
        wrr_cost[1] = lcd / wrr_cost[1];
        wrr_cost[2] = lcd / wrr_cost[2];
        wrr_cost[3] = lcd / wrr_cost[3];

        dst->wrr_cost[0] = (uint8_t) wrr_cost[0];
        dst->wrr_cost[1] = (uint8_t) wrr_cost[1];
        dst->wrr_cost[2] = (uint8_t) wrr_cost[2];
        dst->wrr_cost[3] = (uint8_t) wrr_cost[3];
}

static void
rte_sched_subport_config_pipe_profile_table(struct rte_sched_subport *subport,
        struct rte_sched_subport_params *params, uint64_t rate)
{
        uint32_t i;

        for (i = 0; i < subport->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *src = params->pipe_profiles + i;
                struct rte_sched_pipe_profile *dst = subport->pipe_profiles + i;

                rte_sched_pipe_profile_convert(subport, src, dst, rate);
                rte_sched_port_log_pipe_profile(subport, i);
        }

        subport->pipe_tc_be_rate_max = 0;
        for (i = 0; i < subport->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *src = params->pipe_profiles + i;
                uint64_t pipe_tc_be_rate = src->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE];

                if (subport->pipe_tc_be_rate_max < pipe_tc_be_rate)
                        subport->pipe_tc_be_rate_max = pipe_tc_be_rate;
        }
}

static int
rte_sched_subport_check_params(struct rte_sched_subport_params *params,
        uint32_t n_max_pipes_per_subport,
        uint64_t rate)
{
        uint32_t i;

        /* Check user parameters */
        if (params == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter params\n", __func__);
                return -EINVAL;
        }

        if (params->tb_rate == 0 || params->tb_rate > rate) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb rate\n", __func__);
                return -EINVAL;
        }

        if (params->tb_size == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tb size\n", __func__);
                return -EINVAL;
        }

        /* qsize: if non-zero, power of 2,
         * no bigger than 32K (due to 16-bit read/write pointers)
         */
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint16_t qsize = params->qsize[i];

                if (qsize != 0 && !rte_is_power_of_2(qsize)) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for qsize\n", __func__);
                        return -EINVAL;
                }
        }

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint64_t tc_rate = params->tc_rate[i];
                uint16_t qsize = params->qsize[i];

                if ((qsize == 0 && tc_rate != 0) ||
                        (qsize != 0 && tc_rate == 0) ||
                        (tc_rate > params->tb_rate)) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Incorrect value for tc rate\n", __func__);
                        return -EINVAL;
                }
        }

        if (params->qsize[RTE_SCHED_TRAFFIC_CLASS_BE] == 0 ||
                params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE] == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect qsize or tc rate (best effort)\n", __func__);
                return -EINVAL;
        }

        if (params->tc_period == 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for tc period\n", __func__);
                return -EINVAL;
        }

        /* n_pipes_per_subport: non-zero, power of 2 */
        if (params->n_pipes_per_subport_enabled == 0 ||
                params->n_pipes_per_subport_enabled > n_max_pipes_per_subport ||
            !rte_is_power_of_2(params->n_pipes_per_subport_enabled)) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for pipes number\n", __func__);
                return -EINVAL;
        }

        /* pipe_profiles and n_pipe_profiles */
        if (params->pipe_profiles == NULL ||
            params->n_pipe_profiles == 0 ||
                params->n_max_pipe_profiles == 0 ||
                params->n_pipe_profiles > params->n_max_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for pipe profiles\n", __func__);
                return -EINVAL;
        }

        for (i = 0; i < params->n_pipe_profiles; i++) {
                struct rte_sched_pipe_params *p = params->pipe_profiles + i;
                int status;

                status = pipe_profile_check(p, rate, &params->qsize[0]);
                if (status != 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Pipe profile check failed (%d)\n", __func__, status);
                        return -EINVAL;
                }
        }

        return 0;
}

uint32_t
rte_sched_port_get_memory_footprint(struct rte_sched_port_params *port_params,
        struct rte_sched_subport_params **subport_params)
{
        uint32_t size0 = 0, size1 = 0, i;
        int status;

        status = rte_sched_port_check_params(port_params);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Port scheduler port params check failed (%d)\n",
                        __func__, status);

                return 0;
        }

        for (i = 0; i < port_params->n_subports_per_port; i++) {
                struct rte_sched_subport_params *sp = subport_params[i];

                status = rte_sched_subport_check_params(sp,
                                port_params->n_pipes_per_subport,
                                port_params->rate);
                if (status != 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Port scheduler subport params check failed (%d)\n",
                                __func__, status);

                        return 0;
                }
        }

        size0 = sizeof(struct rte_sched_port);

        for (i = 0; i < port_params->n_subports_per_port; i++) {
                struct rte_sched_subport_params *sp = subport_params[i];

                size1 += rte_sched_subport_get_array_base(sp,
                                        e_RTE_SCHED_SUBPORT_ARRAY_TOTAL);
        }

        return size0 + size1;
}

struct rte_sched_port *
rte_sched_port_config(struct rte_sched_port_params *params)
{
        struct rte_sched_port *port = NULL;
        uint32_t size0, size1;
        uint32_t cycles_per_byte;
        uint32_t i, j;
        int status;

        status = rte_sched_port_check_params(params);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Port scheduler params check failed (%d)\n",
                        __func__, status);
                return NULL;
        }

        size0 = sizeof(struct rte_sched_port);
        size1 = params->n_subports_per_port * sizeof(struct rte_sched_subport *);

        /* Allocate memory to store the data structures */
        port = rte_zmalloc_socket("qos_params", size0 + size1, RTE_CACHE_LINE_SIZE,
                params->socket);
        if (port == NULL) {
                RTE_LOG(ERR, SCHED, "%s: Memory allocation fails\n", __func__);

                return NULL;
        }

        /* User parameters */
        port->n_subports_per_port = params->n_subports_per_port;
        port->n_pipes_per_subport = params->n_pipes_per_subport;
        port->n_pipes_per_subport_log2 =
                        __builtin_ctz(params->n_pipes_per_subport);
        port->socket = params->socket;

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                port->pipe_queue[i] = i;

        for (i = 0, j = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) {
                port->pipe_tc[i] = j;

                if (j < RTE_SCHED_TRAFFIC_CLASS_BE)
                        j++;
        }

        for (i = 0, j = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) {
                port->tc_queue[i] = j;

                if (i >= RTE_SCHED_TRAFFIC_CLASS_BE)
                        j++;
        }
        port->rate = params->rate;
        port->mtu = params->mtu + params->frame_overhead;
        port->frame_overhead = params->frame_overhead;

        /* Timing */
        port->time_cpu_cycles = rte_get_tsc_cycles();
        port->time_cpu_bytes = 0;
        port->time = 0;

        cycles_per_byte = (rte_get_tsc_hz() << RTE_SCHED_TIME_SHIFT)
                / params->rate;
        port->inv_cycles_per_byte = rte_reciprocal_value(cycles_per_byte);

        /* Grinders */
        port->pkts_out = NULL;
        port->n_pkts_out = 0;
        port->subport_id = 0;

        return port;
}

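/*
 * Minimal usage sketch (illustrative only; all parameter values below are
 * hypothetical):
 *
 *	struct rte_sched_port_params pp = {
 *		.socket = 0,
 *		.rate = 1250000000,          // bytes/sec, i.e. 10 GbE
 *		.mtu = 1522,
 *		.frame_overhead = 24,        // FCS + preamble + IFG
 *		.n_subports_per_port = 1,
 *		.n_pipes_per_subport = 4096,
 *	};
 *	struct rte_sched_port *p = rte_sched_port_config(&pp);
 *
 * The port is then populated via rte_sched_subport_config() and
 * rte_sched_pipe_config() before use.
 */
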
static inline void
rte_sched_subport_free(struct rte_sched_port *port,
        struct rte_sched_subport *subport)
{
        uint32_t n_subport_pipe_queues;
        uint32_t qindex;

        if (subport == NULL)
                return;

        n_subport_pipe_queues = rte_sched_subport_pipe_queues(subport);

        /* Free enqueued mbufs */
        for (qindex = 0; qindex < n_subport_pipe_queues; qindex++) {
                struct rte_mbuf **mbufs =
                        rte_sched_subport_pipe_qbase(subport, qindex);
                uint16_t qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
                if (qsize != 0) {
                        struct rte_sched_queue *queue = subport->queue + qindex;
                        uint16_t qr = queue->qr & (qsize - 1);
                        uint16_t qw = queue->qw & (qsize - 1);

                        for (; qr != qw; qr = (qr + 1) & (qsize - 1))
                                rte_pktmbuf_free(mbufs[qr]);
                }
        }

        rte_free(subport);
}

void
rte_sched_port_free(struct rte_sched_port *port)
{
        uint32_t i;

        /* Check user parameters */
        if (port == NULL)
                return;

        for (i = 0; i < port->n_subports_per_port; i++)
                rte_sched_subport_free(port, port->subports[i]);

        rte_free(port);
}

static void
rte_sched_port_log_subport_config(struct rte_sched_port *port, uint32_t i)
{
        struct rte_sched_subport *s = port->subports[i];

        RTE_LOG(DEBUG, SCHED, "Low level config for subport %u:\n"
                "       Token bucket: period = %"PRIu64", credits per period = %"PRIu64
                ", size = %"PRIu64"\n"
                "       Traffic classes: period = %"PRIu64"\n"
                "       credits per period = [%"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64
                ", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64
                ", %"PRIu64", %"PRIu64", %"PRIu64"]\n"
                "       Best effort traffic class oversubscription: wm min = %"PRIu64
                ", wm max = %"PRIu64"\n",
                i,

                /* Token bucket */
                s->tb_period,
                s->tb_credits_per_period,
                s->tb_size,

                /* Traffic classes */
                s->tc_period,
                s->tc_credits_per_period[0],
                s->tc_credits_per_period[1],
                s->tc_credits_per_period[2],
                s->tc_credits_per_period[3],
                s->tc_credits_per_period[4],
                s->tc_credits_per_period[5],
                s->tc_credits_per_period[6],
                s->tc_credits_per_period[7],
                s->tc_credits_per_period[8],
                s->tc_credits_per_period[9],
                s->tc_credits_per_period[10],
                s->tc_credits_per_period[11],
                s->tc_credits_per_period[12],

                /* Best effort traffic class oversubscription */
                s->tc_ov_wm_min,
                s->tc_ov_wm_max);
}

static void
rte_sched_free_memory(struct rte_sched_port *port, uint32_t n_subports)
{
        uint32_t i;

        for (i = 0; i < n_subports; i++) {
                struct rte_sched_subport *subport = port->subports[i];

                rte_sched_subport_free(port, subport);
        }

        rte_free(port);
}

int
rte_sched_subport_config(struct rte_sched_port *port,
        uint32_t subport_id,
        struct rte_sched_subport_params *params)
{
        struct rte_sched_subport *s = NULL;
        uint32_t n_subports = subport_id;
        uint32_t n_subport_pipe_queues, i;
        uint32_t size0, size1, bmp_mem_size;
        int status;

        /* Check user parameters */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return 0;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        status = rte_sched_subport_check_params(params,
                port->n_pipes_per_subport,
                port->rate);
        if (status != 0) {
                RTE_LOG(NOTICE, SCHED,
                        "%s: Port scheduler params check failed (%d)\n",
                        __func__, status);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        /* Determine the amount of memory to allocate */
        size0 = sizeof(struct rte_sched_subport);
        size1 = rte_sched_subport_get_array_base(params,
                                e_RTE_SCHED_SUBPORT_ARRAY_TOTAL);

        /* Allocate memory to store the data structures */
        s = rte_zmalloc_socket("subport_params", size0 + size1,
                RTE_CACHE_LINE_SIZE, port->socket);
        if (s == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Memory allocation fails\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -ENOMEM;
        }

        n_subports++;

        /* Port */
        port->subports[subport_id] = s;

        /* Token Bucket (TB) */
        if (params->tb_rate == port->rate) {
                s->tb_credits_per_period = 1;
                s->tb_period = 1;
        } else {
                double tb_rate = ((double) params->tb_rate) / ((double) port->rate);
                double d = RTE_SCHED_TB_RATE_CONFIG_ERR;

                rte_approx_64(tb_rate, d, &s->tb_credits_per_period, &s->tb_period);
        }

        s->tb_size = params->tb_size;
        s->tb_time = port->time;
        s->tb_credits = s->tb_size / 2;

        /* Traffic Classes (TCs) */
        s->tc_period = rte_sched_time_ms_to_bytes(params->tc_period, port->rate);
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                if (params->qsize[i])
                        s->tc_credits_per_period[i]
                                = rte_sched_time_ms_to_bytes(params->tc_period,
                                        params->tc_rate[i]);
        }
        s->tc_time = port->time + s->tc_period;
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (params->qsize[i])
                        s->tc_credits[i] = s->tc_credits_per_period[i];

        /* compile time checks */
        RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS == 0);
        RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS &
                (RTE_SCHED_PORT_N_GRINDERS - 1));

        /* User parameters */
        s->n_pipes_per_subport_enabled = params->n_pipes_per_subport_enabled;
        memcpy(s->qsize, params->qsize, sizeof(params->qsize));
        s->n_pipe_profiles = params->n_pipe_profiles;
        s->n_max_pipe_profiles = params->n_max_pipe_profiles;

#ifdef RTE_SCHED_RED
        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) {
                uint32_t j;

                for (j = 0; j < RTE_COLORS; j++) {
                        /* if min/max are both zero, then RED is disabled */
                        if ((params->red_params[i][j].min_th |
                             params->red_params[i][j].max_th) == 0) {
                                continue;
                        }

                        if (rte_red_config_init(&s->red_config[i][j],
                                params->red_params[i][j].wq_log2,
                                params->red_params[i][j].min_th,
                                params->red_params[i][j].max_th,
                                params->red_params[i][j].maxp_inv) != 0) {
                                rte_sched_free_memory(port, n_subports);

                                RTE_LOG(NOTICE, SCHED,
                                "%s: RED configuration init fails\n", __func__);
                                return -EINVAL;
                        }
                }
        }
#endif

        /* Scheduling loop detection */
        s->pipe_loop = RTE_SCHED_PIPE_INVALID;
        s->pipe_exhaustion = 0;

        /* Grinders */
        s->busy_grinders = 0;

        /* Queue base calculation */
        rte_sched_subport_config_qsize(s);

        /* Large data structures */
        s->pipe = (struct rte_sched_pipe *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_PIPE));
        s->queue = (struct rte_sched_queue *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE));
        s->queue_extra = (struct rte_sched_queue_extra *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_EXTRA));
        s->pipe_profiles = (struct rte_sched_pipe_profile *)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_PIPE_PROFILES));
        s->bmp_array = s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_BMP_ARRAY);
        s->queue_array = (struct rte_mbuf **)
                (s->memory + rte_sched_subport_get_array_base(params,
                e_RTE_SCHED_SUBPORT_ARRAY_QUEUE_ARRAY));

        /* Pipe profile table */
        rte_sched_subport_config_pipe_profile_table(s, params, port->rate);

        /* Bitmap */
        n_subport_pipe_queues = rte_sched_subport_pipe_queues(s);
        bmp_mem_size = rte_bitmap_get_memory_footprint(n_subport_pipe_queues);
        s->bmp = rte_bitmap_init(n_subport_pipe_queues, s->bmp_array,
                                bmp_mem_size);
        if (s->bmp == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Subport bitmap init error\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++)
                s->grinder_base_bmp_pos[i] = RTE_SCHED_PIPE_INVALID;

#ifdef RTE_SCHED_SUBPORT_TC_OV
        /* TC oversubscription */
        s->tc_ov_wm_min = port->mtu;
        s->tc_ov_wm_max = rte_sched_time_ms_to_bytes(params->tc_period,
                                                     s->pipe_tc_be_rate_max);
        s->tc_ov_wm = s->tc_ov_wm_max;
        s->tc_ov_period_id = 0;
        s->tc_ov = 0;
        s->tc_ov_n = 0;
        s->tc_ov_rate = 0;
#endif

        rte_sched_port_log_subport_config(port, subport_id);

        return 0;
}

int
rte_sched_pipe_config(struct rte_sched_port *port,
        uint32_t subport_id,
        uint32_t pipe_id,
        int32_t pipe_profile)
{
        struct rte_sched_subport *s;
        struct rte_sched_pipe *p;
        struct rte_sched_pipe_profile *params;
        uint32_t n_subports = subport_id + 1;
        uint32_t deactivate, profile, i;

        /* Check user parameters */
        profile = (uint32_t) pipe_profile;
        deactivate = (pipe_profile < 0);

        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter subport id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        s = port->subports[subport_id];
        if (pipe_id >= s->n_pipes_per_subport_enabled) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter pipe id\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        if (!deactivate && profile >= s->n_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter pipe profile\n", __func__);

                rte_sched_free_memory(port, n_subports);
                return -EINVAL;
        }

        /* Handle the case when pipe already has a valid configuration */
        p = s->pipe + pipe_id;
        if (p->tb_time) {
                params = s->pipe_profiles + p->profile;

                double subport_tc_be_rate =
                        (double) s->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) s->tc_period;
                double pipe_tc_be_rate =
                        (double) params->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) params->tc_period;
                uint32_t tc_be_ov = s->tc_ov;

                /* Unplug pipe from its subport */
                s->tc_ov_n -= params->tc_ov_weight;
                s->tc_ov_rate -= pipe_tc_be_rate;
                s->tc_ov = s->tc_ov_rate > subport_tc_be_rate;

                if (s->tc_ov != tc_be_ov) {
                        RTE_LOG(DEBUG, SCHED,
                                "Subport %u Best-effort TC oversubscription is OFF (%.4lf >= %.4lf)\n",
                                subport_id, subport_tc_be_rate, s->tc_ov_rate);
                }

                /* Reset the pipe */
                memset(p, 0, sizeof(struct rte_sched_pipe));
        }

        if (deactivate)
                return 0;

        /* Apply the new pipe configuration */
        p->profile = profile;
        params = s->pipe_profiles + p->profile;

        /* Token Bucket (TB) */
        p->tb_time = port->time;
        p->tb_credits = params->tb_size / 2;

        /* Traffic Classes (TCs) */
        p->tc_time = port->time + params->tc_period;

        for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
                if (s->qsize[i])
                        p->tc_credits[i] = params->tc_credits_per_period[i];

        {
                /* Subport best effort tc oversubscription */
                double subport_tc_be_rate =
                        (double) s->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) s->tc_period;
                double pipe_tc_be_rate =
                        (double) params->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE]
                        / (double) params->tc_period;
                uint32_t tc_be_ov = s->tc_ov;

                s->tc_ov_n += params->tc_ov_weight;
                s->tc_ov_rate += pipe_tc_be_rate;
                s->tc_ov = s->tc_ov_rate > subport_tc_be_rate;

                if (s->tc_ov != tc_be_ov) {
                        RTE_LOG(DEBUG, SCHED,
                                "Subport %u Best effort TC oversubscription is ON (%.4lf < %.4lf)\n",
                                subport_id, subport_tc_be_rate, s->tc_ov_rate);
                }
                p->tc_ov_period_id = s->tc_ov_period_id;
                p->tc_ov_credits = s->tc_ov_wm;
        }

        return 0;
}

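/*
 * Note: the tc_ov flag maintained above tracks best-effort oversubscription;
 * it is set whenever the sum of the member pipes' best-effort TC rates
 * (s->tc_ov_rate) exceeds the subport's own best-effort TC rate, both
 * derived from the respective credits-per-period and period values.
 */
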
int
rte_sched_subport_pipe_profile_add(struct rte_sched_port *port,
        uint32_t subport_id,
        struct rte_sched_pipe_params *params,
        uint32_t *pipe_profile_id)
{
        struct rte_sched_subport *s;
        struct rte_sched_pipe_profile *pp;
        uint32_t i;
        int status;

        /* Port */
        if (port == NULL) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for parameter port\n", __func__);
                return -EINVAL;
        }

        /* Subport id must not exceed the max limit */
        if (subport_id >= port->n_subports_per_port) {
                RTE_LOG(ERR, SCHED,
                        "%s: Incorrect value for subport id\n", __func__);
                return -EINVAL;
        }

        s = port->subports[subport_id];

        /* Number of pipe profiles must not exceed the max limit */
        if (s->n_pipe_profiles >= s->n_max_pipe_profiles) {
                RTE_LOG(ERR, SCHED,
                        "%s: Number of pipe profiles exceeds the max limit\n", __func__);
                return -EINVAL;
        }

        /* Pipe params */
        status = pipe_profile_check(params, port->rate, &s->qsize[0]);
        if (status != 0) {
                RTE_LOG(ERR, SCHED,
                        "%s: Pipe profile check failed (%d)\n", __func__, status);
                return -EINVAL;
        }

        pp = &s->pipe_profiles[s->n_pipe_profiles];
        rte_sched_pipe_profile_convert(s, params, pp, port->rate);

        /* Pipe profile must not already exist */
        for (i = 0; i < s->n_pipe_profiles; i++)
                if (memcmp(s->pipe_profiles + i, pp, sizeof(*pp)) == 0) {
                        RTE_LOG(ERR, SCHED,
                                "%s: Pipe profile exists\n", __func__);
                        return -EINVAL;
                }

        /* Pipe profile commit */
        *pipe_profile_id = s->n_pipe_profiles;
        s->n_pipe_profiles++;

        if (s->pipe_tc_be_rate_max < params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE])
                s->pipe_tc_be_rate_max = params->tc_rate[RTE_SCHED_TRAFFIC_CLASS_BE];

        rte_sched_port_log_pipe_profile(s, *pipe_profile_id);

        return 0;
}

static inline uint32_t
rte_sched_port_qindex(struct rte_sched_port *port,
        uint32_t subport,
        uint32_t pipe,
        uint32_t traffic_class,
        uint32_t queue)
{
        return ((subport & (port->n_subports_per_port - 1)) <<
                (port->n_pipes_per_subport_log2 + 4)) |
                ((pipe &
                (port->subports[subport]->n_pipes_per_subport_enabled - 1)) << 4) |
                ((rte_sched_port_pipe_queue(port, traffic_class) + queue) &
                (RTE_SCHED_QUEUES_PER_PIPE - 1));
}

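/*
 * Resulting queue index layout, for a port with 2^S subports and 2^P
 * enabled pipes per subport (S and P as configured above):
 *
 *	msb                                      lsb
 *	+--------------+--------------+-------------+
 *	|  subport     |  pipe        |  queue      |
 *	|  S bits      |  P bits      |  4 bits     |
 *	+--------------+--------------+-------------+
 */
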
1353 void
1354 rte_sched_port_pkt_write(struct rte_sched_port *port,
1355                          struct rte_mbuf *pkt,
1356                          uint32_t subport, uint32_t pipe,
1357                          uint32_t traffic_class,
1358                          uint32_t queue, enum rte_color color)
1359 {
1360         uint32_t queue_id =
1361                 rte_sched_port_qindex(port, subport, pipe, traffic_class, queue);
1362
1363         rte_mbuf_sched_set(pkt, queue_id, traffic_class, (uint8_t)color);
1364 }
1365
1366 void
1367 rte_sched_port_pkt_read_tree_path(struct rte_sched_port *port,
1368                                   const struct rte_mbuf *pkt,
1369                                   uint32_t *subport, uint32_t *pipe,
1370                                   uint32_t *traffic_class, uint32_t *queue)
1371 {
1372         uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
1373
1374         *subport = queue_id >> (port->n_pipes_per_subport_log2 + 4);
1375         *pipe = (queue_id >> 4) &
1376                 (port->subports[*subport]->n_pipes_per_subport_enabled - 1);
1377         *traffic_class = rte_sched_port_pipe_tc(port, queue_id);
1378         *queue = rte_sched_port_tc_queue(port, queue_id);
1379 }
1380
1381 enum rte_color
1382 rte_sched_port_pkt_read_color(const struct rte_mbuf *pkt)
1383 {
1384         return (enum rte_color)rte_mbuf_sched_color_get(pkt);
1385 }
1386
1387 int
1388 rte_sched_subport_read_stats(struct rte_sched_port *port,
1389                              uint32_t subport_id,
1390                              struct rte_sched_subport_stats *stats,
1391                              uint32_t *tc_ov)
1392 {
1393         struct rte_sched_subport *s;
1394
1395         /* Check user parameters */
1396         if (port == NULL) {
1397                 RTE_LOG(ERR, SCHED,
1398                         "%s: Incorrect value for parameter port\n", __func__);
1399                 return -EINVAL;
1400         }
1401
1402         if (subport_id >= port->n_subports_per_port) {
1403                 RTE_LOG(ERR, SCHED,
1404                         "%s: Incorrect value for subport id\n", __func__);
1405                 return -EINVAL;
1406         }
1407
1408         if (stats == NULL) {
1409                 RTE_LOG(ERR, SCHED,
1410                         "%s: Incorrect value for parameter stats\n", __func__);
1411                 return -EINVAL;
1412         }
1413
1414         if (tc_ov == NULL) {
1415                 RTE_LOG(ERR, SCHED,
1416                         "%s: Incorrect value for tc_ov\n", __func__);
1417                 return -EINVAL;
1418         }
1419
1420         s = port->subports[subport_id];
1421
1422         /* Copy subport stats and clear */
1423         memcpy(stats, &s->stats, sizeof(struct rte_sched_subport_stats));
1424         memset(&s->stats, 0, sizeof(struct rte_sched_subport_stats));
1425
1426         /* Subport TC oversubscription status */
1427         *tc_ov = s->tc_ov;
1428
1429         return 0;
1430 }
1431
1432 int
1433 rte_sched_queue_read_stats(struct rte_sched_port *port,
1434         uint32_t queue_id,
1435         struct rte_sched_queue_stats *stats,
1436         uint16_t *qlen)
1437 {
1438         struct rte_sched_subport *s;
1439         struct rte_sched_queue *q;
1440         struct rte_sched_queue_extra *qe;
1441         uint32_t subport_id, subport_qmask, subport_qindex;
1442
1443         /* Check user parameters */
1444         if (port == NULL) {
1445                 RTE_LOG(ERR, SCHED,
1446                         "%s: Incorrect value for parameter port\n", __func__);
1447                 return -EINVAL;
1448         }
1449
1450         if (queue_id >= rte_sched_port_queues_per_port(port)) {
1451                 RTE_LOG(ERR, SCHED,
1452                         "%s: Incorrect value for queue id\n", __func__);
1453                 return -EINVAL;
1454         }
1455
1456         if (stats == NULL) {
1457                 RTE_LOG(ERR, SCHED,
1458                         "%s: Incorrect value for parameter stats\n", __func__);
1459                 return -EINVAL;
1460         }
1461
1462         if (qlen == NULL) {
1463                 RTE_LOG(ERR, SCHED,
1464                         "%s: Incorrect value for parameter qlen\n", __func__);
1465                 return -EINVAL;
1466         }
1467         subport_qmask = port->n_pipes_per_subport_log2 + 4;
1468         subport_id = (queue_id >> subport_qmask) & (port->n_subports_per_port - 1);
1469
1470         s = port->subports[subport_id];
1471         subport_qindex = ((1 << subport_qmask) - 1) & queue_id;
1472         q = s->queue + subport_qindex;
1473         qe = s->queue_extra + subport_qindex;
1474
1475         /* Copy queue stats and clear */
1476         memcpy(stats, &qe->stats, sizeof(struct rte_sched_queue_stats));
1477         memset(&qe->stats, 0, sizeof(struct rte_sched_queue_stats));
1478
1479         /* Queue length */
1480         *qlen = q->qw - q->qr;
1481
1482         return 0;
1483 }
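/*
 * Note: qw and qr are free-running 16-bit counters, so the unsigned
 * subtraction above stays correct across wrap-around. E.g. qw = 0x0003
 * and qr = 0xFFFE give qw - qr = 5 packets in the queue, which holds as
 * long as a queue never exceeds 64K entries.
 */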
1484
1485 #ifdef RTE_SCHED_DEBUG
1486
1487 static inline int
1488 rte_sched_port_queue_is_empty(struct rte_sched_subport *subport,
1489         uint32_t qindex)
1490 {
1491         struct rte_sched_queue *queue = subport->queue + qindex;
1492
1493         return queue->qr == queue->qw;
1494 }
1495
1496 #endif /* RTE_SCHED_DEBUG */
1497
1498 #ifdef RTE_SCHED_COLLECT_STATS
1499
1500 static inline void
1501 rte_sched_port_update_subport_stats(struct rte_sched_port *port,
1502         struct rte_sched_subport *subport,
1503         uint32_t qindex,
1504         struct rte_mbuf *pkt)
1505 {
1506         uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex);
1507         uint32_t pkt_len = pkt->pkt_len;
1508
1509         subport->stats.n_pkts_tc[tc_index] += 1;
1510         subport->stats.n_bytes_tc[tc_index] += pkt_len;
1511 }
1512
1513 #ifdef RTE_SCHED_RED
1514 static inline void
1515 rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port,
1516         struct rte_sched_subport *subport,
1517         uint32_t qindex,
1518         struct rte_mbuf *pkt,
1519         uint32_t red)
1520 #else
1521 static inline void
1522 rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port,
1523         struct rte_sched_subport *subport,
1524         uint32_t qindex,
1525         struct rte_mbuf *pkt,
1526         __rte_unused uint32_t red)
1527 #endif
1528 {
1529         uint32_t tc_index = rte_sched_port_pipe_tc(port, qindex);
1530         uint32_t pkt_len = pkt->pkt_len;
1531
1532         subport->stats.n_pkts_tc_dropped[tc_index] += 1;
1533         subport->stats.n_bytes_tc_dropped[tc_index] += pkt_len;
1534 #ifdef RTE_SCHED_RED
1535         subport->stats.n_pkts_red_dropped[tc_index] += red;
1536 #endif
1537 }
1538
1539 static inline void
1540 rte_sched_port_update_queue_stats(struct rte_sched_subport *subport,
1541         uint32_t qindex,
1542         struct rte_mbuf *pkt)
1543 {
1544         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1545         uint32_t pkt_len = pkt->pkt_len;
1546
1547         qe->stats.n_pkts += 1;
1548         qe->stats.n_bytes += pkt_len;
1549 }
1550
1551 #ifdef RTE_SCHED_RED
1552 static inline void
1553 rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport,
1554         uint32_t qindex,
1555         struct rte_mbuf *pkt,
1556         uint32_t red)
1557 #else
1558 static inline void
1559 rte_sched_port_update_queue_stats_on_drop(struct rte_sched_subport *subport,
1560         uint32_t qindex,
1561         struct rte_mbuf *pkt,
1562         __rte_unused uint32_t red)
1563 #endif
1564 {
1565         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1566         uint32_t pkt_len = pkt->pkt_len;
1567
1568         qe->stats.n_pkts_dropped += 1;
1569         qe->stats.n_bytes_dropped += pkt_len;
1570 #ifdef RTE_SCHED_RED
1571         qe->stats.n_pkts_red_dropped += red;
1572 #endif
1573 }
1574
1575 #endif /* RTE_SCHED_COLLECT_STATS */
1576
1577 #ifdef RTE_SCHED_RED
1578
1579 static inline int
1580 rte_sched_port_red_drop(struct rte_sched_port *port,
1581         struct rte_sched_subport *subport,
1582         struct rte_mbuf *pkt,
1583         uint32_t qindex,
1584         uint16_t qlen)
1585 {
1586         struct rte_sched_queue_extra *qe;
1587         struct rte_red_config *red_cfg;
1588         struct rte_red *red;
1589         uint32_t tc_index;
1590         enum rte_color color;
1591
1592         tc_index = rte_sched_port_pipe_tc(port, qindex);
1593         color = rte_sched_port_pkt_read_color(pkt);
1594         red_cfg = &subport->red_config[tc_index][color];
1595
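        /* min_th == max_th == 0 means RED is not configured for this
         * (traffic class, color) pair: never drop early.
         */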
1596         if ((red_cfg->min_th | red_cfg->max_th) == 0)
1597                 return 0;
1598
1599         qe = subport->queue_extra + qindex;
1600         red = &qe->red;
1601
1602         return rte_red_enqueue(red_cfg, red, qlen, port->time);
1603 }
1604
1605 static inline void
1606 rte_sched_port_set_queue_empty_timestamp(struct rte_sched_port *port,
1607         struct rte_sched_subport *subport, uint32_t qindex)
1608 {
1609         struct rte_sched_queue_extra *qe = subport->queue_extra + qindex;
1610         struct rte_red *red = &qe->red;
1611
1612         rte_red_mark_queue_empty(red, port->time);
1613 }
1614
1615 #else
1616
1617 static inline int rte_sched_port_red_drop(struct rte_sched_port *port __rte_unused,
1618         struct rte_sched_subport *subport __rte_unused,
1619         struct rte_mbuf *pkt __rte_unused,
1620         uint32_t qindex __rte_unused,
1621         uint16_t qlen __rte_unused)
1622 {
1623         return 0;
1624 }
1625
1626 #define rte_sched_port_set_queue_empty_timestamp(port, subport, qindex)
1627
1628 #endif /* RTE_SCHED_RED */
1629
1630 #ifdef RTE_SCHED_DEBUG
1631
1632 static inline void
1633 debug_check_queue_slab(struct rte_sched_subport *subport, uint32_t bmp_pos,
1634                        uint64_t bmp_slab)
1635 {
1636         uint64_t mask;
1637         uint32_t i, panic;
1638
1639         if (bmp_slab == 0)
1640                 rte_panic("Empty slab at position %u\n", bmp_pos);
1641
1642         panic = 0;
1643         for (i = 0, mask = 1; i < 64; i++, mask <<= 1) {
1644                 if (mask & bmp_slab) {
1645                         if (rte_sched_port_queue_is_empty(subport, bmp_pos + i)) {
1646                                 printf("Queue %u (slab offset %u) is empty\n", bmp_pos + i, i);
1647                                 panic = 1;
1648                         }
1649                 }
1650         }
1651
1652         if (panic)
1653                 rte_panic("Empty queues in slab 0x%" PRIx64 " starting at position %u\n",
1654                         bmp_slab, bmp_pos);
1655 }
1656
1657 #endif /* RTE_SCHED_DEBUG */
1658
1659 static inline struct rte_sched_subport *
1660 rte_sched_port_subport(struct rte_sched_port *port,
1661         struct rte_mbuf *pkt)
1662 {
1663         uint32_t queue_id = rte_mbuf_sched_queue_get(pkt);
1664         uint32_t subport_id = queue_id >> (port->n_pipes_per_subport_log2 + 4);
1665
1666         return port->subports[subport_id];
1667 }
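/*
 * Layout note: the port-level queue index packs, from MSB to LSB, the
 * subport index, the pipe index within the subport
 * (n_pipes_per_subport_log2 bits) and a 4-bit queue index within the
 * pipe (16 queues: one per high priority TC plus 4 best-effort queues),
 * hence the shift above. Example: with 4096 pipes per subport
 * (log2 = 12), the subport index is queue_id >> 16.
 */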
1668
1669 static inline uint32_t
1670 rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_subport *subport,
1671         struct rte_mbuf *pkt, uint32_t subport_qmask)
1672 {
1673         struct rte_sched_queue *q;
1674 #ifdef RTE_SCHED_COLLECT_STATS
1675         struct rte_sched_queue_extra *qe;
1676 #endif
1677         uint32_t qindex = rte_mbuf_sched_queue_get(pkt);
1678         uint32_t subport_queue_id = subport_qmask & qindex;
1679
1680         q = subport->queue + subport_queue_id;
1681         rte_prefetch0(q);
1682 #ifdef RTE_SCHED_COLLECT_STATS
1683         qe = subport->queue_extra + subport_queue_id;
1684         rte_prefetch0(qe);
1685 #endif
1686
1687         return subport_queue_id;
1688 }
1689
1690 static inline void
1691 rte_sched_port_enqueue_qwa_prefetch0(struct rte_sched_port *port,
1692         struct rte_sched_subport *subport,
1693         uint32_t qindex,
1694         struct rte_mbuf **qbase)
1695 {
1696         struct rte_sched_queue *q;
1697         struct rte_mbuf **q_qw;
1698         uint16_t qsize;
1699
1700         q = subport->queue + qindex;
1701         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
1702         q_qw = qbase + (q->qw & (qsize - 1));
1703
1704         rte_prefetch0(q_qw);
1705         rte_bitmap_prefetch0(subport->bmp, qindex);
1706 }
1707
1708 static inline int
1709 rte_sched_port_enqueue_qwa(struct rte_sched_port *port,
1710         struct rte_sched_subport *subport,
1711         uint32_t qindex,
1712         struct rte_mbuf **qbase,
1713         struct rte_mbuf *pkt)
1714 {
1715         struct rte_sched_queue *q;
1716         uint16_t qsize;
1717         uint16_t qlen;
1718
1719         q = subport->queue + qindex;
1720         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
1721         qlen = q->qw - q->qr;
1722
1723         /* Drop the packet (and update drop stats) when queue is full */
1724         if (unlikely(rte_sched_port_red_drop(port, subport, pkt, qindex, qlen) ||
1725                      (qlen >= qsize))) {
1726                 rte_pktmbuf_free(pkt);
1727 #ifdef RTE_SCHED_COLLECT_STATS
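                /* qlen < qsize means the queue itself still had room, so the
                 * drop was decided by RED rather than tail drop; the flag
                 * feeds the RED drop counters.
                 */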
1728                 rte_sched_port_update_subport_stats_on_drop(port, subport,
1729                         qindex, pkt, qlen < qsize);
1730                 rte_sched_port_update_queue_stats_on_drop(subport, qindex, pkt,
1731                         qlen < qsize);
1732 #endif
1733                 return 0;
1734         }
1735
1736         /* Enqueue packet */
1737         qbase[q->qw & (qsize - 1)] = pkt;
1738         q->qw++;
1739
1740         /* Activate queue in the subport bitmap */
1741         rte_bitmap_set(subport->bmp, qindex);
1742
1743         /* Statistics */
1744 #ifdef RTE_SCHED_COLLECT_STATS
1745         rte_sched_port_update_subport_stats(port, subport, qindex, pkt);
1746         rte_sched_port_update_queue_stats(subport, qindex, pkt);
1747 #endif
1748
1749         return 1;
1750 }
1751
1752
1753 /*
1754  * The enqueue function implements a 4-level pipeline with each stage
1755  * processing two different packets. The purpose of using a pipeline
1756  * is to hide the latency of prefetching the data structures. The
1757  * naming convention is presented in the diagram below:
1758  *
1759  *   p00  _______   p10  _______   p20  _______   p30  _______
1760  * ----->|       |----->|       |----->|       |----->|       |----->
1761  *       |   0   |      |   1   |      |   2   |      |   3   |
1762  * ----->|_______|----->|_______|----->|_______|----->|_______|----->
1763  *   p01            p11            p21            p31
1764  *
1765  */
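/*
 * In the pXY names, X is the pipeline stage currently holding the
 * packet (0..3) and Y selects one of the two packets advanced per
 * stage, so up to 8 packets are in flight at any time. Only stage 3 has
 * side effects (the packet is written to its queue); stages 0..2 merely
 * issue prefetches for the data the later stages will touch.
 */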
1766 int
1767 rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts,
1768                        uint32_t n_pkts)
1769 {
1770         struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21,
1771                 *pkt30, *pkt31, *pkt_last;
1772         struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base,
1773                 **q20_base, **q21_base, **q30_base, **q31_base, **q_last_base;
1774         struct rte_sched_subport *subport00, *subport01, *subport10, *subport11,
1775                 *subport20, *subport21, *subport30, *subport31, *subport_last;
1776         uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last;
1777         uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last;
1778         uint32_t subport_qmask;
1779         uint32_t result, i;
1780
1781         result = 0;
1782         subport_qmask = (1 << (port->n_pipes_per_subport_log2 + 4)) - 1;
1783
1784         /*
1785          * Less than 6 input packets available, which is not enough to
1786          * feed the pipeline
1787          */
1788         if (unlikely(n_pkts < 6)) {
1789                 struct rte_sched_subport *subports[5];
1790                 struct rte_mbuf **q_base[5];
1791                 uint32_t q[5];
1792
1793                 /* Prefetch the mbuf structure of each packet */
1794                 for (i = 0; i < n_pkts; i++)
1795                         rte_prefetch0(pkts[i]);
1796
1797                 /* Prefetch the subport structure for each packet */
1798                 for (i = 0; i < n_pkts; i++)
1799                         subports[i] = rte_sched_port_subport(port, pkts[i]);
1800
1801                 /* Prefetch the queue structure for each packet's queue */
1802                 for (i = 0; i < n_pkts; i++)
1803                         q[i] = rte_sched_port_enqueue_qptrs_prefetch0(subports[i],
1804                                         pkts[i], subport_qmask);
1805
1806                 /* Prefetch the write pointer location of each queue */
1807                 for (i = 0; i < n_pkts; i++) {
1808                         q_base[i] = rte_sched_subport_pipe_qbase(subports[i], q[i]);
1809                         rte_sched_port_enqueue_qwa_prefetch0(port, subports[i],
1810                                 q[i], q_base[i]);
1811                 }
1812
1813                 /* Write each packet to its queue */
1814                 for (i = 0; i < n_pkts; i++)
1815                         result += rte_sched_port_enqueue_qwa(port, subports[i],
1816                                                 q[i], q_base[i], pkts[i]);
1817
1818                 return result;
1819         }
1820
1821         /* Feed the first 3 stages of the pipeline (6 packets needed) */
1822         pkt20 = pkts[0];
1823         pkt21 = pkts[1];
1824         rte_prefetch0(pkt20);
1825         rte_prefetch0(pkt21);
1826
1827         pkt10 = pkts[2];
1828         pkt11 = pkts[3];
1829         rte_prefetch0(pkt10);
1830         rte_prefetch0(pkt11);
1831
1832         subport20 = rte_sched_port_subport(port, pkt20);
1833         subport21 = rte_sched_port_subport(port, pkt21);
1834         q20 = rte_sched_port_enqueue_qptrs_prefetch0(subport20,
1835                         pkt20, subport_qmask);
1836         q21 = rte_sched_port_enqueue_qptrs_prefetch0(subport21,
1837                         pkt21, subport_qmask);
1838
1839         pkt00 = pkts[4];
1840         pkt01 = pkts[5];
1841         rte_prefetch0(pkt00);
1842         rte_prefetch0(pkt01);
1843
1844         subport10 = rte_sched_port_subport(port, pkt10);
1845         subport11 = rte_sched_port_subport(port, pkt11);
1846         q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
1847                         pkt10, subport_qmask);
1848         q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
1849                         pkt11, subport_qmask);
1850
1851         q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
1852         q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
1853         rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
1854         rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
1855
1856         /* Run the pipeline */
1857         for (i = 6; i < (n_pkts & (~1)); i += 2) {
1858                 /* Propagate stage inputs */
1859                 pkt30 = pkt20;
1860                 pkt31 = pkt21;
1861                 pkt20 = pkt10;
1862                 pkt21 = pkt11;
1863                 pkt10 = pkt00;
1864                 pkt11 = pkt01;
1865                 q30 = q20;
1866                 q31 = q21;
1867                 q20 = q10;
1868                 q21 = q11;
1869                 subport30 = subport20;
1870                 subport31 = subport21;
1871                 subport20 = subport10;
1872                 subport21 = subport11;
1873                 q30_base = q20_base;
1874                 q31_base = q21_base;
1875
1876                 /* Stage 0: Get packets in */
1877                 pkt00 = pkts[i];
1878                 pkt01 = pkts[i + 1];
1879                 rte_prefetch0(pkt00);
1880                 rte_prefetch0(pkt01);
1881
1882                 /* Stage 1: Prefetch the subport and the queue structure storing the queue pointers */
1883                 subport10 = rte_sched_port_subport(port, pkt10);
1884                 subport11 = rte_sched_port_subport(port, pkt11);
1885                 q10 = rte_sched_port_enqueue_qptrs_prefetch0(subport10,
1886                                 pkt10, subport_qmask);
1887                 q11 = rte_sched_port_enqueue_qptrs_prefetch0(subport11,
1888                                 pkt11, subport_qmask);
1889
1890                 /* Stage 2: Prefetch queue write location */
1891                 q20_base = rte_sched_subport_pipe_qbase(subport20, q20);
1892                 q21_base = rte_sched_subport_pipe_qbase(subport21, q21);
1893                 rte_sched_port_enqueue_qwa_prefetch0(port, subport20, q20, q20_base);
1894                 rte_sched_port_enqueue_qwa_prefetch0(port, subport21, q21, q21_base);
1895
1896                 /* Stage 3: Write packet to queue and activate queue */
1897                 r30 = rte_sched_port_enqueue_qwa(port, subport30,
1898                                 q30, q30_base, pkt30);
1899                 r31 = rte_sched_port_enqueue_qwa(port, subport31,
1900                                 q31, q31_base, pkt31);
1901                 result += r30 + r31;
1902         }
1903
1904         /*
1905          * Drain the pipeline (exactly 6 packets).
1906          * Handle the last packet in the case
1907          * of an odd number of input packets.
1908          */
1909         pkt_last = pkts[n_pkts - 1];
1910         rte_prefetch0(pkt_last);
1911
1912         subport00 = rte_sched_port_subport(port, pkt00);
1913         subport01 = rte_sched_port_subport(port, pkt01);
1914         q00 = rte_sched_port_enqueue_qptrs_prefetch0(subport00,
1915                         pkt00, subport_qmask);
1916         q01 = rte_sched_port_enqueue_qptrs_prefetch0(subport01,
1917                         pkt01, subport_qmask);
1918
1919         q10_base = rte_sched_subport_pipe_qbase(subport10, q10);
1920         q11_base = rte_sched_subport_pipe_qbase(subport11, q11);
1921         rte_sched_port_enqueue_qwa_prefetch0(port, subport10, q10, q10_base);
1922         rte_sched_port_enqueue_qwa_prefetch0(port, subport11, q11, q11_base);
1923
1924         r20 = rte_sched_port_enqueue_qwa(port, subport20,
1925                         q20, q20_base, pkt20);
1926         r21 = rte_sched_port_enqueue_qwa(port, subport21,
1927                         q21, q21_base, pkt21);
1928         result += r20 + r21;
1929
1930         subport_last = rte_sched_port_subport(port, pkt_last);
1931         q_last = rte_sched_port_enqueue_qptrs_prefetch0(subport_last,
1932                                 pkt_last, subport_qmask);
1933
1934         q00_base = rte_sched_subport_pipe_qbase(subport00, q00);
1935         q01_base = rte_sched_subport_pipe_qbase(subport01, q01);
1936         rte_sched_port_enqueue_qwa_prefetch0(port, subport00, q00, q00_base);
1937         rte_sched_port_enqueue_qwa_prefetch0(port, subport01, q01, q01_base);
1938
1939         r10 = rte_sched_port_enqueue_qwa(port, subport10, q10,
1940                         q10_base, pkt10);
1941         r11 = rte_sched_port_enqueue_qwa(port, subport11, q11,
1942                         q11_base, pkt11);
1943         result += r10 + r11;
1944
1945         q_last_base = rte_sched_subport_pipe_qbase(subport_last, q_last);
1946         rte_sched_port_enqueue_qwa_prefetch0(port, subport_last,
1947                 q_last, q_last_base);
1948
1949         r00 = rte_sched_port_enqueue_qwa(port, subport00, q00,
1950                         q00_base, pkt00);
1951         r01 = rte_sched_port_enqueue_qwa(port, subport01, q01,
1952                         q01_base, pkt01);
1953         result += r00 + r01;
1954
1955         if (n_pkts & 1) {
1956                 r_last = rte_sched_port_enqueue_qwa(port, subport_last,
1957                                         q_last, q_last_base, pkt_last);
1958                 result += r_last;
1959         }
1960
1961         return result;
1962 }
1963
1964 #ifndef RTE_SCHED_SUBPORT_TC_OV
1965
1966 static inline void
1967 grinder_credits_update(struct rte_sched_port *port,
1968         struct rte_sched_subport *subport, uint32_t pos)
1969 {
1970         struct rte_sched_grinder *grinder = subport->grinder + pos;
1971         struct rte_sched_pipe *pipe = grinder->pipe;
1972         struct rte_sched_pipe_profile *params = grinder->pipe_params;
1973         uint64_t n_periods;
1974         uint32_t i;
1975
1976         /* Subport TB */
1977         n_periods = (port->time - subport->tb_time) / subport->tb_period;
1978         subport->tb_credits += n_periods * subport->tb_credits_per_period;
1979         subport->tb_credits = RTE_MIN(subport->tb_credits, subport->tb_size);
1980         subport->tb_time += n_periods * subport->tb_period;
1981
1982         /* Pipe TB */
1983         n_periods = (port->time - pipe->tb_time) / params->tb_period;
1984         pipe->tb_credits += n_periods * params->tb_credits_per_period;
1985         pipe->tb_credits = RTE_MIN(pipe->tb_credits, params->tb_size);
1986         pipe->tb_time += n_periods * params->tb_period;
1987
1988         /* Subport TCs */
1989         if (unlikely(port->time >= subport->tc_time)) {
1990                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
1991                         subport->tc_credits[i] = subport->tc_credits_per_period[i];
1992
1993                 subport->tc_time = port->time + subport->tc_period;
1994         }
1995
1996         /* Pipe TCs */
1997         if (unlikely(port->time >= pipe->tc_time)) {
1998                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
1999                         pipe->tc_credits[i] = params->tc_credits_per_period[i];
2000
2001                 pipe->tc_time = port->time + params->tc_period;
2002         }
2003 }
2004
2005 #else
2006
2007 static inline uint64_t
2008 grinder_tc_ov_credits_update(struct rte_sched_port *port,
2009         struct rte_sched_subport *subport)
2010 {
2011         uint64_t tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
2012         uint64_t tc_consumption = 0, tc_ov_consumption_max;
2013         uint64_t tc_ov_wm = subport->tc_ov_wm;
2014         uint32_t i;
2015
2016         if (subport->tc_ov == 0)
2017                 return subport->tc_ov_wm_max;
2018
2019         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASS_BE; i++) {
2020                 tc_ov_consumption[i] =
2021                         subport->tc_credits_per_period[i] - subport->tc_credits[i];
2022                 tc_consumption += tc_ov_consumption[i];
2023         }
2024
2025         tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASS_BE] =
2026                 subport->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE] -
2027                 subport->tc_credits[RTE_SCHED_TRAFFIC_CLASS_BE];
2028
2029         tc_ov_consumption_max =
2030                 subport->tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASS_BE] -
2031                         tc_consumption;
2032
2033         if (tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASS_BE] >
2034                 (tc_ov_consumption_max - port->mtu)) {
2035                 tc_ov_wm -= tc_ov_wm >> 7;
2036                 if (tc_ov_wm < subport->tc_ov_wm_min)
2037                         tc_ov_wm = subport->tc_ov_wm_min;
2038
2039                 return tc_ov_wm;
2040         }
2041
2042         tc_ov_wm += (tc_ov_wm >> 7) + 1;
2043         if (tc_ov_wm > subport->tc_ov_wm_max)
2044                 tc_ov_wm = subport->tc_ov_wm_max;
2045
2046         return tc_ov_wm;
2047 }
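/*
 * Worked example (editor's note): the watermark moves by roughly 1/128
 * of its value per subport tc_period. If the best-effort TC consumed
 * more than the credits left unused by the higher priority TCs (less
 * one MTU), the subport is congested: tc_ov_wm 12800 becomes
 * 12800 - (12800 >> 7) = 12700. Otherwise it becomes
 * 12800 + (12800 >> 7) + 1 = 12901. Both moves are clamped to
 * [tc_ov_wm_min, tc_ov_wm_max].
 */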
2048
2049 static inline void
2050 grinder_credits_update(struct rte_sched_port *port,
2051         struct rte_sched_subport *subport, uint32_t pos)
2052 {
2053         struct rte_sched_grinder *grinder = subport->grinder + pos;
2054         struct rte_sched_pipe *pipe = grinder->pipe;
2055         struct rte_sched_pipe_profile *params = grinder->pipe_params;
2056         uint64_t n_periods;
2057         uint32_t i;
2058
2059         /* Subport TB */
2060         n_periods = (port->time - subport->tb_time) / subport->tb_period;
2061         subport->tb_credits += n_periods * subport->tb_credits_per_period;
2062         subport->tb_credits = RTE_MIN(subport->tb_credits, subport->tb_size);
2063         subport->tb_time += n_periods * subport->tb_period;
2064
2065         /* Pipe TB */
2066         n_periods = (port->time - pipe->tb_time) / params->tb_period;
2067         pipe->tb_credits += n_periods * params->tb_credits_per_period;
2068         pipe->tb_credits = RTE_MIN(pipe->tb_credits, params->tb_size);
2069         pipe->tb_time += n_periods * params->tb_period;
2070
2071         /* Subport TCs */
2072         if (unlikely(port->time >= subport->tc_time)) {
2073                 subport->tc_ov_wm = grinder_tc_ov_credits_update(port, subport);
2074
2075                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2076                         subport->tc_credits[i] = subport->tc_credits_per_period[i];
2077
2078                 subport->tc_time = port->time + subport->tc_period;
2079                 subport->tc_ov_period_id++;
2080         }
2081
2082         /* Pipe TCs */
2083         if (unlikely(port->time >= pipe->tc_time)) {
2084                 for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2085                         pipe->tc_credits[i] = params->tc_credits_per_period[i];
2086                 pipe->tc_time = port->time + params->tc_period;
2087         }
2088
2089         /* Pipe TCs - Oversubscription */
2090         if (unlikely(pipe->tc_ov_period_id != subport->tc_ov_period_id)) {
2091                 pipe->tc_ov_credits = subport->tc_ov_wm * params->tc_ov_weight;
2092
2093                 pipe->tc_ov_period_id = subport->tc_ov_period_id;
2094         }
2095 }
2096
2097 #endif /* RTE_SCHED_SUBPORT_TC_OV */
2098
2099
2100 #ifndef RTE_SCHED_SUBPORT_TC_OV
2101
2102 static inline int
2103 grinder_credits_check(struct rte_sched_port *port,
2104         struct rte_sched_subport *subport, uint32_t pos)
2105 {
2106         struct rte_sched_grinder *grinder = subport->grinder + pos;
2107         struct rte_sched_pipe *pipe = grinder->pipe;
2108         struct rte_mbuf *pkt = grinder->pkt;
2109         uint32_t tc_index = grinder->tc_index;
2110         uint64_t pkt_len = pkt->pkt_len + port->frame_overhead;
2111         uint64_t subport_tb_credits = subport->tb_credits;
2112         uint64_t subport_tc_credits = subport->tc_credits[tc_index];
2113         uint64_t pipe_tb_credits = pipe->tb_credits;
2114         uint64_t pipe_tc_credits = pipe->tc_credits[tc_index];
2115         int enough_credits;
2116
2117         /* Check pipe and subport credits */
2118         enough_credits = (pkt_len <= subport_tb_credits) &&
2119                 (pkt_len <= subport_tc_credits) &&
2120                 (pkt_len <= pipe_tb_credits) &&
2121                 (pkt_len <= pipe_tc_credits);
2122
2123         if (!enough_credits)
2124                 return 0;
2125
2126         /* Update pipe and subport credits */
2127         subport->tb_credits -= pkt_len;
2128         subport->tc_credits[tc_index] -= pkt_len;
2129         pipe->tb_credits -= pkt_len;
2130         pipe->tc_credits[tc_index] -= pkt_len;
2131
2132         return 1;
2133 }
2134
2135 #else
2136
2137 static inline int
2138 grinder_credits_check(struct rte_sched_port *port,
2139         struct rte_sched_subport *subport, uint32_t pos)
2140 {
2141         struct rte_sched_grinder *grinder = subport->grinder + pos;
2142         struct rte_sched_pipe *pipe = grinder->pipe;
2143         struct rte_mbuf *pkt = grinder->pkt;
2144         uint32_t tc_index = grinder->tc_index;
2145         uint64_t pkt_len = pkt->pkt_len + port->frame_overhead;
2146         uint64_t subport_tb_credits = subport->tb_credits;
2147         uint64_t subport_tc_credits = subport->tc_credits[tc_index];
2148         uint64_t pipe_tb_credits = pipe->tb_credits;
2149         uint64_t pipe_tc_credits = pipe->tc_credits[tc_index];
2150         uint64_t pipe_tc_ov_mask1[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
2151         uint64_t pipe_tc_ov_mask2[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE] = {0};
2152         uint64_t pipe_tc_ov_credits;
2153         uint32_t i;
2154         int enough_credits;
2155
2156         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
2157                 pipe_tc_ov_mask1[i] = ~0LLU;
2158
2159         pipe_tc_ov_mask1[RTE_SCHED_TRAFFIC_CLASS_BE] = pipe->tc_ov_credits;
2160         pipe_tc_ov_mask2[RTE_SCHED_TRAFFIC_CLASS_BE] = ~0LLU;
2161         pipe_tc_ov_credits = pipe_tc_ov_mask1[tc_index];
2162
2163         /* Check pipe and subport credits */
2164         enough_credits = (pkt_len <= subport_tb_credits) &&
2165                 (pkt_len <= subport_tc_credits) &&
2166                 (pkt_len <= pipe_tb_credits) &&
2167                 (pkt_len <= pipe_tc_credits) &&
2168                 (pkt_len <= pipe_tc_ov_credits);
2169
2170         if (!enough_credits)
2171                 return 0;
2172
2173         /* Update pipe and subport credits */
2174         subport->tb_credits -= pkt_len;
2175         subport->tc_credits[tc_index] -= pkt_len;
2176         pipe->tb_credits -= pkt_len;
2177         pipe->tc_credits[tc_index] -= pkt_len;
2178         pipe->tc_ov_credits -= pipe_tc_ov_mask2[tc_index] & pkt_len;
2179
2180         return 1;
2181 }
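/*
 * The two mask arrays make the oversubscription handling branchless:
 * for a high priority TC, pipe_tc_ov_mask1[tc_index] is ~0 (so the
 * "pkt_len <= pipe_tc_ov_credits" test always passes) and
 * pipe_tc_ov_mask2[tc_index] is 0 (no OV credits are consumed). For the
 * best-effort TC, mask1 carries the real pipe->tc_ov_credits and mask2
 * is ~0, so both the check and the deduction take effect.
 */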
2182
2183 #endif /* RTE_SCHED_SUBPORT_TC_OV */
2184
2185
2186 static inline int
2187 grinder_schedule(struct rte_sched_port *port,
2188         struct rte_sched_subport *subport, uint32_t pos)
2189 {
2190         struct rte_sched_grinder *grinder = subport->grinder + pos;
2191         struct rte_sched_queue *queue = grinder->queue[grinder->qpos];
2192         struct rte_mbuf *pkt = grinder->pkt;
2193         uint32_t pkt_len = pkt->pkt_len + port->frame_overhead;
2194         uint32_t be_tc_active;
2195
2196         if (!grinder_credits_check(port, subport, pos))
2197                 return 0;
2198
2199         /* Advance port time */
2200         port->time += pkt_len;
2201
2202         /* Send packet */
2203         port->pkts_out[port->n_pkts_out++] = pkt;
2204         queue->qr++;
2205
2206         be_tc_active = (grinder->tc_index == RTE_SCHED_TRAFFIC_CLASS_BE) ? ~0x0 : 0x0;
2207         grinder->wrr_tokens[grinder->qpos] +=
2208                 (pkt_len * grinder->wrr_cost[grinder->qpos]) & be_tc_active;
2209
2210         if (queue->qr == queue->qw) {
2211                 uint32_t qindex = grinder->qindex[grinder->qpos];
2212
2213                 rte_bitmap_clear(subport->bmp, qindex);
2214                 grinder->qmask &= ~(1 << grinder->qpos);
2215                 if (be_tc_active)
2216                         grinder->wrr_mask[grinder->qpos] = 0;
2217                 rte_sched_port_set_queue_empty_timestamp(port, subport, qindex);
2218         }
2219
2220         /* Reset pipe loop detection */
2221         subport->pipe_loop = RTE_SCHED_PIPE_INVALID;
2222         grinder->productive = 1;
2223
2224         return 1;
2225 }
2226
2227 #ifdef SCHED_VECTOR_SSE4
2228
2229 static inline int
2230 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2231 {
2232         __m128i index = _mm_set1_epi32(base_pipe);
2233         __m128i pipes = _mm_load_si128((__m128i *)subport->grinder_base_bmp_pos);
2234         __m128i res = _mm_cmpeq_epi32(pipes, index);
2235
2236         pipes = _mm_load_si128((__m128i *)(subport->grinder_base_bmp_pos + 4));
2237         pipes = _mm_cmpeq_epi32(pipes, index);
2238         res = _mm_or_si128(res, pipes);
2239
2240         if (_mm_testz_si128(res, res))
2241                 return 0;
2242
2243         return 1;
2244 }
2245
2246 #elif defined(SCHED_VECTOR_NEON)
2247
2248 static inline int
2249 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2250 {
2251         uint32x4_t index, pipes;
2252         uint32_t *pos = (uint32_t *)subport->grinder_base_bmp_pos;
2253
2254         index = vmovq_n_u32(base_pipe);
2255         pipes = vld1q_u32(pos);
2256         if (!vminvq_u32(veorq_u32(pipes, index)))
2257                 return 1;
2258
2259         pipes = vld1q_u32(pos + 4);
2260         if (!vminvq_u32(veorq_u32(pipes, index)))
2261                 return 1;
2262
2263         return 0;
2264 }
2265
2266 #else
2267
2268 static inline int
2269 grinder_pipe_exists(struct rte_sched_subport *subport, uint32_t base_pipe)
2270 {
2271         uint32_t i;
2272
2273         for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++) {
2274                 if (subport->grinder_base_bmp_pos[i] == base_pipe)
2275                         return 1;
2276         }
2277
2278         return 0;
2279 }
2280
2281 #endif /* SCHED_VECTOR_SSE4, SCHED_VECTOR_NEON */
2282
2283 static inline void
2284 grinder_pcache_populate(struct rte_sched_subport *subport,
2285         uint32_t pos, uint32_t bmp_pos, uint64_t bmp_slab)
2286 {
2287         struct rte_sched_grinder *grinder = subport->grinder + pos;
2288         uint16_t w[4];
2289
2290         grinder->pcache_w = 0;
2291         grinder->pcache_r = 0;
2292
2293         w[0] = (uint16_t) bmp_slab;
2294         w[1] = (uint16_t) (bmp_slab >> 16);
2295         w[2] = (uint16_t) (bmp_slab >> 32);
2296         w[3] = (uint16_t) (bmp_slab >> 48);
2297
2298         grinder->pcache_qmask[grinder->pcache_w] = w[0];
2299         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos;
2300         grinder->pcache_w += (w[0] != 0);
2301
2302         grinder->pcache_qmask[grinder->pcache_w] = w[1];
2303         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 16;
2304         grinder->pcache_w += (w[1] != 0);
2305
2306         grinder->pcache_qmask[grinder->pcache_w] = w[2];
2307         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 32;
2308         grinder->pcache_w += (w[2] != 0);
2309
2310         grinder->pcache_qmask[grinder->pcache_w] = w[3];
2311         grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 48;
2312         grinder->pcache_w += (w[3] != 0);
2313 }
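/*
 * A bitmap slab covers 64 consecutive queues, i.e. four pipes of 16
 * queues each. The slab is split into four 16-bit words, one active
 * queue mask per pipe, and only the non-zero words are pushed into the
 * pipe cache. E.g. bmp_slab = 0x0001000000010000 yields two entries:
 * qmask 0x0001 at qindex bmp_pos + 16 and qmask 0x0001 at bmp_pos + 48.
 */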
2314
2315 static inline void
2316 grinder_tccache_populate(struct rte_sched_subport *subport,
2317         uint32_t pos, uint32_t qindex, uint16_t qmask)
2318 {
2319         struct rte_sched_grinder *grinder = subport->grinder + pos;
2320         uint8_t b, i;
2321
2322         grinder->tccache_w = 0;
2323         grinder->tccache_r = 0;
2324
2325         for (i = 0; i < RTE_SCHED_TRAFFIC_CLASS_BE; i++) {
2326                 b = (uint8_t) ((qmask >> i) & 0x1);
2327                 grinder->tccache_qmask[grinder->tccache_w] = b;
2328                 grinder->tccache_qindex[grinder->tccache_w] = qindex + i;
2329                 grinder->tccache_w += (b != 0);
2330         }
2331
2332         b = (uint8_t) (qmask >> (RTE_SCHED_TRAFFIC_CLASS_BE));
2333         grinder->tccache_qmask[grinder->tccache_w] = b;
2334         grinder->tccache_qindex[grinder->tccache_w] = qindex +
2335                 RTE_SCHED_TRAFFIC_CLASS_BE;
2336         grinder->tccache_w += (b != 0);
2337 }
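/*
 * The TC cache gets one entry per active high priority traffic class (a
 * single bit each) plus at most one entry for the best-effort TC, whose
 * entry carries the 4-bit mask of its active queues. E.g.
 * qmask = 0x9001 produces two entries: bit 0 for TC0, then the BE entry
 * with qmask 0x9 (BE queues 0 and 3 active).
 */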
2338
2339 static inline int
2340 grinder_next_tc(struct rte_sched_port *port,
2341         struct rte_sched_subport *subport, uint32_t pos)
2342 {
2343         struct rte_sched_grinder *grinder = subport->grinder + pos;
2344         struct rte_mbuf **qbase;
2345         uint32_t qindex;
2346         uint16_t qsize;
2347
2348         if (grinder->tccache_r == grinder->tccache_w)
2349                 return 0;
2350
2351         qindex = grinder->tccache_qindex[grinder->tccache_r];
2352         qbase = rte_sched_subport_pipe_qbase(subport, qindex);
2353         qsize = rte_sched_subport_pipe_qsize(port, subport, qindex);
2354
2355         grinder->tc_index = rte_sched_port_pipe_tc(port, qindex);
2356         grinder->qmask = grinder->tccache_qmask[grinder->tccache_r];
2357         grinder->qsize = qsize;
2358
2359         if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) {
2360                 grinder->queue[0] = subport->queue + qindex;
2361                 grinder->qbase[0] = qbase;
2362                 grinder->qindex[0] = qindex;
2363                 grinder->tccache_r++;
2364
2365                 return 1;
2366         }
2367
2368         grinder->queue[0] = subport->queue + qindex;
2369         grinder->queue[1] = subport->queue + qindex + 1;
2370         grinder->queue[2] = subport->queue + qindex + 2;
2371         grinder->queue[3] = subport->queue + qindex + 3;
2372
2373         grinder->qbase[0] = qbase;
2374         grinder->qbase[1] = qbase + qsize;
2375         grinder->qbase[2] = qbase + 2 * qsize;
2376         grinder->qbase[3] = qbase + 3 * qsize;
2377
2378         grinder->qindex[0] = qindex;
2379         grinder->qindex[1] = qindex + 1;
2380         grinder->qindex[2] = qindex + 2;
2381         grinder->qindex[3] = qindex + 3;
2382
2383         grinder->tccache_r++;
2384         return 1;
2385 }
2386
2387 static inline int
2388 grinder_next_pipe(struct rte_sched_port *port,
2389         struct rte_sched_subport *subport, uint32_t pos)
2390 {
2391         struct rte_sched_grinder *grinder = subport->grinder + pos;
2392         uint32_t pipe_qindex;
2393         uint16_t pipe_qmask;
2394
2395         if (grinder->pcache_r < grinder->pcache_w) {
2396                 pipe_qmask = grinder->pcache_qmask[grinder->pcache_r];
2397                 pipe_qindex = grinder->pcache_qindex[grinder->pcache_r];
2398                 grinder->pcache_r++;
2399         } else {
2400                 uint64_t bmp_slab = 0;
2401                 uint32_t bmp_pos = 0;
2402
2403                 /* Get another non-empty pipe group */
2404                 if (unlikely(rte_bitmap_scan(subport->bmp, &bmp_pos, &bmp_slab) <= 0))
2405                         return 0;
2406
2407 #ifdef RTE_SCHED_DEBUG
2408                 debug_check_queue_slab(subport, bmp_pos, bmp_slab);
2409 #endif
2410
2411                 /* Return if pipe group already in one of the other grinders */
2412                 subport->grinder_base_bmp_pos[pos] = RTE_SCHED_BMP_POS_INVALID;
2413                 if (unlikely(grinder_pipe_exists(subport, bmp_pos)))
2414                         return 0;
2415
2416                 subport->grinder_base_bmp_pos[pos] = bmp_pos;
2417
2418                 /* Install new pipe group into grinder's pipe cache */
2419                 grinder_pcache_populate(subport, pos, bmp_pos, bmp_slab);
2420
2421                 pipe_qmask = grinder->pcache_qmask[0];
2422                 pipe_qindex = grinder->pcache_qindex[0];
2423                 grinder->pcache_r = 1;
2424         }
2425
2426         /* Install new pipe in the grinder */
2427         grinder->pindex = pipe_qindex >> 4;
2428         grinder->subport = subport;
2429         grinder->pipe = subport->pipe + grinder->pindex;
2430         grinder->pipe_params = NULL; /* to be set after the pipe structure is prefetched */
2431         grinder->productive = 0;
2432
2433         grinder_tccache_populate(subport, pos, pipe_qindex, pipe_qmask);
2434         grinder_next_tc(port, subport, pos);
2435
2436         /* Check for pipe exhaustion */
2437         if (grinder->pindex == subport->pipe_loop) {
2438                 subport->pipe_exhaustion = 1;
2439                 subport->pipe_loop = RTE_SCHED_PIPE_INVALID;
2440         }
2441
2442         return 1;
2443 }
2444
2445
2446 static inline void
2447 grinder_wrr_load(struct rte_sched_subport *subport, uint32_t pos)
2448 {
2449         struct rte_sched_grinder *grinder = subport->grinder + pos;
2450         struct rte_sched_pipe *pipe = grinder->pipe;
2451         struct rte_sched_pipe_profile *pipe_params = grinder->pipe_params;
2452         uint32_t qmask = grinder->qmask;
2453
2454         grinder->wrr_tokens[0] =
2455                 ((uint16_t) pipe->wrr_tokens[0]) << RTE_SCHED_WRR_SHIFT;
2456         grinder->wrr_tokens[1] =
2457                 ((uint16_t) pipe->wrr_tokens[1]) << RTE_SCHED_WRR_SHIFT;
2458         grinder->wrr_tokens[2] =
2459                 ((uint16_t) pipe->wrr_tokens[2]) << RTE_SCHED_WRR_SHIFT;
2460         grinder->wrr_tokens[3] =
2461                 ((uint16_t) pipe->wrr_tokens[3]) << RTE_SCHED_WRR_SHIFT;
2462
2463         grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
2464         grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
2465         grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
2466         grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
2467
2468         grinder->wrr_cost[0] = pipe_params->wrr_cost[0];
2469         grinder->wrr_cost[1] = pipe_params->wrr_cost[1];
2470         grinder->wrr_cost[2] = pipe_params->wrr_cost[2];
2471         grinder->wrr_cost[3] = pipe_params->wrr_cost[3];
2472 }
2473
2474 static inline void
2475 grinder_wrr_store(struct rte_sched_subport *subport, uint32_t pos)
2476 {
2477         struct rte_sched_grinder *grinder = subport->grinder + pos;
2478         struct rte_sched_pipe *pipe = grinder->pipe;
2479
2480         pipe->wrr_tokens[0] =
2481                         (grinder->wrr_tokens[0] & grinder->wrr_mask[0]) >>
2482                                 RTE_SCHED_WRR_SHIFT;
2483         pipe->wrr_tokens[1] =
2484                         (grinder->wrr_tokens[1] & grinder->wrr_mask[1]) >>
2485                                 RTE_SCHED_WRR_SHIFT;
2486         pipe->wrr_tokens[2] =
2487                         (grinder->wrr_tokens[2] & grinder->wrr_mask[2]) >>
2488                                 RTE_SCHED_WRR_SHIFT;
2489         pipe->wrr_tokens[3] =
2490                         (grinder->wrr_tokens[3] & grinder->wrr_mask[3]) >>
2491                                 RTE_SCHED_WRR_SHIFT;
2492 }
2493
2494 static inline void
2495 grinder_wrr(struct rte_sched_subport *subport, uint32_t pos)
2496 {
2497         struct rte_sched_grinder *grinder = subport->grinder + pos;
2498         uint16_t wrr_tokens_min;
2499
2500         grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
2501         grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
2502         grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
2503         grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
2504
2505         grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
2506         wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
2507
2508         grinder->wrr_tokens[0] -= wrr_tokens_min;
2509         grinder->wrr_tokens[1] -= wrr_tokens_min;
2510         grinder->wrr_tokens[2] -= wrr_tokens_min;
2511         grinder->wrr_tokens[3] -= wrr_tokens_min;
2512 }
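/*
 * Worked example (editor's note): ORing with ~wrr_mask forces the token
 * counters of inactive queues to 0xFFFF so they never win the minimum
 * search. With tokens {48, 16, 32} active and queue 3 inactive, the
 * search picks qpos = 1 and the subtraction leaves
 * {32, 0, 16, 0xFFEF}; the inactive lane is pushed back to 0xFFFF by
 * the OR on the next invocation.
 */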
2513
2514
2515 #define grinder_evict(subport, pos)
2516
2517 static inline void
2518 grinder_prefetch_pipe(struct rte_sched_subport *subport, uint32_t pos)
2519 {
2520         struct rte_sched_grinder *grinder = subport->grinder + pos;
2521
2522         rte_prefetch0(grinder->pipe);
2523         rte_prefetch0(grinder->queue[0]);
2524 }
2525
2526 static inline void
2527 grinder_prefetch_tc_queue_arrays(struct rte_sched_subport *subport, uint32_t pos)
2528 {
2529         struct rte_sched_grinder *grinder = subport->grinder + pos;
2530         uint16_t qsize, qr[RTE_SCHED_MAX_QUEUES_PER_TC];
2531
2532         qsize = grinder->qsize;
2533         grinder->qpos = 0;
2534
2535         if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) {
2536                 qr[0] = grinder->queue[0]->qr & (qsize - 1);
2537
2538                 rte_prefetch0(grinder->qbase[0] + qr[0]);
2539                 return;
2540         }
2541
2542         qr[0] = grinder->queue[0]->qr & (qsize - 1);
2543         qr[1] = grinder->queue[1]->qr & (qsize - 1);
2544         qr[2] = grinder->queue[2]->qr & (qsize - 1);
2545         qr[3] = grinder->queue[3]->qr & (qsize - 1);
2546
2547         rte_prefetch0(grinder->qbase[0] + qr[0]);
2548         rte_prefetch0(grinder->qbase[1] + qr[1]);
2549
2550         grinder_wrr_load(subport, pos);
2551         grinder_wrr(subport, pos);
2552
2553         rte_prefetch0(grinder->qbase[2] + qr[2]);
2554         rte_prefetch0(grinder->qbase[3] + qr[3]);
2555 }
2556
2557 static inline void
2558 grinder_prefetch_mbuf(struct rte_sched_subport *subport, uint32_t pos)
2559 {
2560         struct rte_sched_grinder *grinder = subport->grinder + pos;
2561         uint32_t qpos = grinder->qpos;
2562         struct rte_mbuf **qbase = grinder->qbase[qpos];
2563         uint16_t qsize = grinder->qsize;
2564         uint16_t qr = grinder->queue[qpos]->qr & (qsize - 1);
2565
2566         grinder->pkt = qbase[qr];
2567         rte_prefetch0(grinder->pkt);
2568
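        /* Eight mbuf pointers fit in one cache line; when the read position
         * is the last slot of its line, prefetch the next line for the
         * upcoming dequeues.
         */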
2569         if (unlikely((qr & 0x7) == 7)) {
2570                 uint16_t qr_next = (grinder->queue[qpos]->qr + 1) & (qsize - 1);
2571
2572                 rte_prefetch0(qbase + qr_next);
2573         }
2574 }
2575
2576 static inline uint32_t
2577 grinder_handle(struct rte_sched_port *port,
2578         struct rte_sched_subport *subport, uint32_t pos)
2579 {
2580         struct rte_sched_grinder *grinder = subport->grinder + pos;
2581
2582         switch (grinder->state) {
2583         case e_GRINDER_PREFETCH_PIPE:
2584         {
2585                 if (grinder_next_pipe(port, subport, pos)) {
2586                         grinder_prefetch_pipe(subport, pos);
2587                         subport->busy_grinders++;
2588
2589                         grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS;
2590                         return 0;
2591                 }
2592
2593                 return 0;
2594         }
2595
2596         case e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS:
2597         {
2598                 struct rte_sched_pipe *pipe = grinder->pipe;
2599
2600                 grinder->pipe_params = subport->pipe_profiles + pipe->profile;
2601                 grinder_prefetch_tc_queue_arrays(subport, pos);
2602                 grinder_credits_update(port, subport, pos);
2603
2604                 grinder->state = e_GRINDER_PREFETCH_MBUF;
2605                 return 0;
2606         }
2607
2608         case e_GRINDER_PREFETCH_MBUF:
2609         {
2610                 grinder_prefetch_mbuf(subport, pos);
2611
2612                 grinder->state = e_GRINDER_READ_MBUF;
2613                 return 0;
2614         }
2615
2616         case e_GRINDER_READ_MBUF:
2617         {
2618                 uint32_t wrr_active, result = 0;
2619
2620                 result = grinder_schedule(port, subport, pos);
2621
2622                 wrr_active = (grinder->tc_index == RTE_SCHED_TRAFFIC_CLASS_BE);
2623
2624                 /* Look for next packet within the same TC */
2625                 if (result && grinder->qmask) {
2626                         if (wrr_active)
2627                                 grinder_wrr(subport, pos);
2628
2629                         grinder_prefetch_mbuf(subport, pos);
2630
2631                         return 1;
2632                 }
2633
2634                 if (wrr_active)
2635                         grinder_wrr_store(subport, pos);
2636
2637                 /* Look for another active TC within same pipe */
2638                 if (grinder_next_tc(port, subport, pos)) {
2639                         grinder_prefetch_tc_queue_arrays(subport, pos);
2640
2641                         grinder->state = e_GRINDER_PREFETCH_MBUF;
2642                         return result;
2643                 }
2644
2645                 if (grinder->productive == 0 &&
2646                     subport->pipe_loop == RTE_SCHED_PIPE_INVALID)
2647                         subport->pipe_loop = grinder->pindex;
2648
2649                 grinder_evict(subport, pos);
2650
2651                 /* Look for another active pipe */
2652                 if (grinder_next_pipe(port, subport, pos)) {
2653                         grinder_prefetch_pipe(subport, pos);
2654
2655                         grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS;
2656                         return result;
2657                 }
2658
2659                 /* No active pipe found */
2660                 subport->busy_grinders--;
2661
2662                 grinder->state = e_GRINDER_PREFETCH_PIPE;
2663                 return result;
2664         }
2665
2666         default:
2667                 rte_panic("Algorithmic error (invalid state)\n");
2668                 return 0;
2669         }
2670 }
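/*
 * State machine summary (editor's note): each grinder cycles through
 * PREFETCH_PIPE -> PREFETCH_TC_QUEUE_ARRAYS -> PREFETCH_MBUF ->
 * READ_MBUF. READ_MBUF stays in READ_MBUF while the current TC has more
 * packets, moves to PREFETCH_MBUF when switching to the next active TC
 * of the same pipe, to PREFETCH_TC_QUEUE_ARRAYS when switching to a new
 * pipe, and back to PREFETCH_PIPE when no active pipe is found.
 */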
2671
2672 static inline void
2673 rte_sched_port_time_resync(struct rte_sched_port *port)
2674 {
2675         uint64_t cycles = rte_get_tsc_cycles();
2676         uint64_t cycles_diff = cycles - port->time_cpu_cycles;
2677         uint64_t bytes_diff;
2678         uint32_t i;
2679
2680         /* Compute elapsed time in bytes */
2681         bytes_diff = rte_reciprocal_divide(cycles_diff << RTE_SCHED_TIME_SHIFT,
2682                                            port->inv_cycles_per_byte);
2683
2684         /* Advance port time */
2685         port->time_cpu_cycles = cycles;
2686         port->time_cpu_bytes += bytes_diff;
2687         if (port->time < port->time_cpu_bytes)
2688                 port->time = port->time_cpu_bytes;
2689
2690         /* Reset pipe loop detection */
2691         for (i = 0; i < port->n_subports_per_port; i++)
2692                 port->subports[i]->pipe_loop = RTE_SCHED_PIPE_INVALID;
2693 }
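/*
 * Example (editor's note): port time is kept in bytes. At config time
 * cycles_per_byte is pre-scaled by 2^RTE_SCHED_TIME_SHIFT, so the
 * reciprocal division above computes
 * bytes_diff = (cycles_diff << 8) / scaled_cycles_per_byte. For a
 * 2.5 GHz TSC and a 10 GbE rate of 1.25e9 bytes/s, the scaled value is
 * (2.5e9 << 8) / 1.25e9 = 512, so 1000 elapsed cycles convert to
 * (1000 << 8) / 512 = 500 bytes of transmit budget.
 */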
2694
2695 static inline int
2696 rte_sched_port_exceptions(struct rte_sched_subport *subport, int second_pass)
2697 {
2698         int exceptions;
2699
2700         /* Check if any exception flag is set */
2701         exceptions = (second_pass && subport->busy_grinders == 0) ||
2702                 (subport->pipe_exhaustion == 1);
2703
2704         /* Clear exception flags */
2705         subport->pipe_exhaustion = 0;
2706
2707         return exceptions;
2708 }
2709
2710 int
2711 rte_sched_port_dequeue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts)
2712 {
2713         struct rte_sched_subport *subport;
2714         uint32_t subport_id = port->subport_id;
2715         uint32_t i, n_subports = 0, count;
2716
2717         port->pkts_out = pkts;
2718         port->n_pkts_out = 0;
2719
2720         rte_sched_port_time_resync(port);
2721
2722         /* Take each queue in the grinder one step further */
2723         for (i = 0, count = 0; ; i++)  {
2724                 subport = port->subports[subport_id];
2725
2726                 count += grinder_handle(port, subport,
2727                                 i & (RTE_SCHED_PORT_N_GRINDERS - 1));
2728
2729                 if (count == n_pkts) {
2730                         subport_id++;
2731
2732                         if (subport_id == port->n_subports_per_port)
2733                                 subport_id = 0;
2734
2735                         port->subport_id = subport_id;
2736                         break;
2737                 }
2738
2739                 if (rte_sched_port_exceptions(subport, i >= RTE_SCHED_PORT_N_GRINDERS)) {
2740                         i = 0;
2741                         subport_id++;
2742                         n_subports++;
2743                 }
2744
2745                 if (subport_id == port->n_subports_per_port)
2746                         subport_id = 0;
2747
2748                 if (n_subports == port->n_subports_per_port) {
2749                         port->subport_id = subport_id;
2750                         break;
2751                 }
2752         }
2753
2754         return count;
2755 }
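/*
 * Usage sketch (illustrative only, not part of this file): enqueue and
 * dequeue are typically driven from one lcore in a run-to-completion
 * loop, after classification has stamped each mbuf with its scheduler
 * queue (e.g. via rte_sched_port_pkt_write()):
 *
 *	struct rte_mbuf *bufs[64];
 *	uint16_t nb_rx = rte_eth_rx_burst(eth_port, 0, bufs, 64);
 *	if (nb_rx > 0)
 *		rte_sched_port_enqueue(sched_port, bufs, nb_rx);
 *	uint32_t nb_tx = rte_sched_port_dequeue(sched_port, bufs, 64);
 *	if (nb_tx > 0)
 *		rte_eth_tx_burst(eth_port, 0, bufs, nb_tx);
 *
 * Packets that do not fit their queue are freed inside enqueue, so the
 * caller must not reference them afterwards.
 */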