3fad7688a20c8e3ac273969c8ad401df6d4ee774
[dpdk.git] / examples/performance-thread/common/lthread_sched.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Some portions of this software are derived from
 * https://github.com/halayli/lthread which carries the following license.
 *
 * Copyright (C) 2012, Hasan Alayli <halayli@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define RTE_MEM 1

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>
#include <limits.h>
#include <inttypes.h>
#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sched.h>

#include <rte_config.h>
#include <rte_prefetch.h>
#include <rte_per_lcore.h>
#include <rte_atomic.h>
#include <rte_atomic_64.h>
#include <rte_log.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>

#include "lthread_api.h"
#include "lthread_int.h"
#include "lthread_sched.h"
#include "lthread_objcache.h"
#include "lthread_timer.h"
#include "lthread_mutex.h"
#include "lthread_cond.h"
#include "lthread_tls.h"
#include "lthread_diag.h"

/*
 * This file implements the lthread scheduler.
 * The scheduler is the function lthread_run(), which must be run as the
 * main loop of an EAL thread.
 *
 * Currently, once a scheduler is created it cannot be destroyed; when a
 * scheduler shuts down it is assumed that the application is terminating.
 */
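
/*
 * Illustrative usage sketch, not part of this file: one scheduler runs per
 * EAL lcore, so an application first declares how many schedulers will
 * exist and then has every participating lcore call lthread_run().
 * sched_main() here is a hypothetical application launch function (a
 * sketch of it follows lthread_run() further down); the EAL calls are the
 * standard launch APIs.
 *
 *	unsigned lcore_id;
 *
 *	lthread_num_schedulers_set(rte_lcore_count());
 *	RTE_LCORE_FOREACH_SLAVE(lcore_id)
 *		rte_eal_remote_launch(sched_main, NULL, lcore_id);
 *	sched_main(NULL);	the master lcore runs a scheduler as well
 */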

static rte_atomic16_t num_schedulers;
static rte_atomic16_t active_schedulers;

/* one scheduler per lcore */
RTE_DEFINE_PER_LCORE(struct lthread_sched *, this_sched) = NULL;

struct lthread_sched *schedcore[LTHREAD_MAX_LCORES];

diag_callback diag_cb;

uint64_t diag_mask;


/* constructor */
void lthread_sched_ctor(void) __attribute__ ((constructor));
void lthread_sched_ctor(void)
{
	memset(schedcore, 0, sizeof(schedcore));
	rte_atomic16_init(&num_schedulers);
	rte_atomic16_set(&num_schedulers, 1);
	rte_atomic16_init(&active_schedulers);
	rte_atomic16_set(&active_schedulers, 0);
	diag_cb = NULL;
}

enum sched_alloc_phase {
	SCHED_ALLOC_OK,
	SCHED_ALLOC_QNODE_POOL,
	SCHED_ALLOC_READY_QUEUE,
	SCHED_ALLOC_PREADY_QUEUE,
	SCHED_ALLOC_LTHREAD_CACHE,
	SCHED_ALLOC_STACK_CACHE,
	SCHED_ALLOC_PERLT_CACHE,
	SCHED_ALLOC_TLS_CACHE,
	SCHED_ALLOC_COND_CACHE,
	SCHED_ALLOC_MUTEX_CACHE,
};

static int
_lthread_sched_alloc_resources(struct lthread_sched *new_sched)
{
	int alloc_status;

	do {
		/* Initialize per scheduler queue node pool */
		alloc_status = SCHED_ALLOC_QNODE_POOL;
		new_sched->qnode_pool =
			_qnode_pool_create("qnode pool", LTHREAD_PREALLOC);
		if (new_sched->qnode_pool == NULL)
			break;

		/* Initialize per scheduler local ready queue */
		alloc_status = SCHED_ALLOC_READY_QUEUE;
		new_sched->ready = _lthread_queue_create("ready queue");
		if (new_sched->ready == NULL)
			break;

		/* Initialize per scheduler local peer ready queue */
		alloc_status = SCHED_ALLOC_PREADY_QUEUE;
		new_sched->pready = _lthread_queue_create("pready queue");
		if (new_sched->pready == NULL)
			break;

		/* Initialize per scheduler local free lthread cache */
		alloc_status = SCHED_ALLOC_LTHREAD_CACHE;
		new_sched->lthread_cache =
			_lthread_objcache_create("lthread cache",
						sizeof(struct lthread),
						LTHREAD_PREALLOC);
		if (new_sched->lthread_cache == NULL)
			break;

		/* Initialize per scheduler local free stack cache */
		alloc_status = SCHED_ALLOC_STACK_CACHE;
		new_sched->stack_cache =
			_lthread_objcache_create("stack_cache",
						sizeof(struct lthread_stack),
						LTHREAD_PREALLOC);
		if (new_sched->stack_cache == NULL)
			break;

		/* Initialize per scheduler local free per lthread data cache */
		alloc_status = SCHED_ALLOC_PERLT_CACHE;
		new_sched->per_lthread_cache =
			_lthread_objcache_create("per_lt cache",
						RTE_PER_LTHREAD_SECTION_SIZE,
						LTHREAD_PREALLOC);
		if (new_sched->per_lthread_cache == NULL)
			break;

		/* Initialize per scheduler local free tls cache */
		alloc_status = SCHED_ALLOC_TLS_CACHE;
		new_sched->tls_cache =
			_lthread_objcache_create("TLS cache",
						sizeof(struct lthread_tls),
						LTHREAD_PREALLOC);
		if (new_sched->tls_cache == NULL)
			break;

		/* Initialize per scheduler local free cond var cache */
		alloc_status = SCHED_ALLOC_COND_CACHE;
		new_sched->cond_cache =
			_lthread_objcache_create("cond cache",
						sizeof(struct lthread_cond),
						LTHREAD_PREALLOC);
		if (new_sched->cond_cache == NULL)
			break;

		/* Initialize per scheduler local free mutex cache */
		alloc_status = SCHED_ALLOC_MUTEX_CACHE;
		new_sched->mutex_cache =
			_lthread_objcache_create("mutex cache",
						sizeof(struct lthread_mutex),
						LTHREAD_PREALLOC);
		if (new_sched->mutex_cache == NULL)
			break;

		alloc_status = SCHED_ALLOC_OK;
	} while (0);

	/* roll back on any failure */
	switch (alloc_status) {
	case SCHED_ALLOC_MUTEX_CACHE:
		_lthread_objcache_destroy(new_sched->cond_cache);
		/* fall through */
	case SCHED_ALLOC_COND_CACHE:
		_lthread_objcache_destroy(new_sched->tls_cache);
		/* fall through */
	case SCHED_ALLOC_TLS_CACHE:
		_lthread_objcache_destroy(new_sched->per_lthread_cache);
		/* fall through */
	case SCHED_ALLOC_PERLT_CACHE:
		_lthread_objcache_destroy(new_sched->stack_cache);
		/* fall through */
	case SCHED_ALLOC_STACK_CACHE:
		_lthread_objcache_destroy(new_sched->lthread_cache);
		/* fall through */
	case SCHED_ALLOC_LTHREAD_CACHE:
		_lthread_queue_destroy(new_sched->pready);
		/* fall through */
	case SCHED_ALLOC_PREADY_QUEUE:
		_lthread_queue_destroy(new_sched->ready);
		/* fall through */
	case SCHED_ALLOC_READY_QUEUE:
		_qnode_pool_destroy(new_sched->qnode_pool);
		/* fall through */
	case SCHED_ALLOC_QNODE_POOL:
		/* fall through */
	case SCHED_ALLOC_OK:
		break;
	}
	return alloc_status;
}


/*
 * Create a scheduler on the current lcore
 */
struct lthread_sched *_lthread_sched_create(size_t stack_size)
{
	int status;
	struct lthread_sched *new_sched;
	unsigned lcoreid = rte_lcore_id();

	LTHREAD_ASSERT(stack_size <= LTHREAD_MAX_STACK_SIZE);

	if (stack_size == 0)
		stack_size = LTHREAD_MAX_STACK_SIZE;

	new_sched =
	     rte_calloc_socket(NULL, 1, sizeof(struct lthread_sched),
				RTE_CACHE_LINE_SIZE,
				rte_socket_id());
	if (new_sched == NULL) {
		RTE_LOG(CRIT, LTHREAD,
			"Failed to allocate memory for scheduler\n");
		return NULL;
	}

	_lthread_key_pool_init();

	new_sched->stack_size = stack_size;
	new_sched->birth = rte_rdtsc();
	THIS_SCHED = new_sched;

	status = _lthread_sched_alloc_resources(new_sched);
	if (status != SCHED_ALLOC_OK) {
		RTE_LOG(CRIT, LTHREAD,
			"Failed to allocate resources for scheduler code = %d\n",
			status);
		rte_free(new_sched);
		return NULL;
	}

	bzero(&new_sched->ctx, sizeof(struct ctx));

	new_sched->lcore_id = lcoreid;

	schedcore[lcoreid] = new_sched;

	new_sched->run_flag = 1;

	DIAG_EVENT(new_sched, LT_DIAG_SCHED_CREATE, rte_lcore_id(), 0);

	rte_wmb();
	return new_sched;
}

/*
 * Set the number of schedulers in the system
 */
int lthread_num_schedulers_set(int num)
{
	rte_atomic16_set(&num_schedulers, num);
	return (int)rte_atomic16_read(&num_schedulers);
}
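
/*
 * Illustrative note: the value set here is the count that the start/stop
 * synchronisation loops below wait on, so it must equal the number of
 * lcores that will call lthread_run(). Assuming every EAL lcore runs a
 * scheduler, a typical call is:
 *
 *	lthread_num_schedulers_set(rte_lcore_count());
 */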

/*
 * Return the number of schedulers active
 */
int lthread_active_schedulers(void)
{
	return (int)rte_atomic16_read(&active_schedulers);
}


/**
 * Shut down the scheduler running on the specified lcore
 */
void lthread_scheduler_shutdown(unsigned lcoreid)
{
	uint64_t coreid = (uint64_t) lcoreid;

	if (coreid < LTHREAD_MAX_LCORES) {
		if (schedcore[coreid] != NULL)
			schedcore[coreid]->run_flag = 0;
	}
}

/**
 * Shut down all schedulers
 */
void lthread_scheduler_shutdown_all(void)
{
	uint64_t i;

	/*
	 * give time for all schedulers to have started
	 * Note we use sched_yield() rather than pthread_yield() to allow
	 * for the possibility of a pthread wrapper on lthread_yield(),
	 * something that is not possible unless the scheduler is running.
	 */
	while (rte_atomic16_read(&active_schedulers) <
	       rte_atomic16_read(&num_schedulers))
		sched_yield();

	for (i = 0; i < LTHREAD_MAX_LCORES; i++) {
		if (schedcore[i] != NULL)
			schedcore[i]->run_flag = 0;
	}
}
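
/*
 * Illustrative sketch: shutdown is normally requested from within an
 * lthread once the application's work is done, which lets every
 * lthread_run() loop drain its queues and return. exit_lthread() below is
 * a hypothetical application lthread:
 *
 *	static void exit_lthread(void *arg __rte_unused)
 *	{
 *		... wait here for the worker lthreads to complete ...
 *		lthread_scheduler_shutdown_all();
 *	}
 */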

/*
 * Resume a suspended lthread
 */
static inline void
_lthread_resume(struct lthread *lt) __attribute__ ((always_inline));
static inline void _lthread_resume(struct lthread *lt)
{
	struct lthread_sched *sched = THIS_SCHED;
	struct lthread_stack *s;
	uint64_t state = lt->state;
#if LTHREAD_DIAG
	int init = 0;
#endif

	sched->current_lthread = lt;

	if (state & (BIT(ST_LT_CANCELLED) | BIT(ST_LT_EXITED))) {
		/* if detached we can free the thread now */
		if (state & BIT(ST_LT_DETACH)) {
			_lthread_free(lt);
			sched->current_lthread = NULL;
			return;
		}
	}

	if (state & BIT(ST_LT_INIT)) {
		/* first time this thread has been run */
		/* assign thread to this scheduler */
		lt->sched = THIS_SCHED;

		/* allocate stack */
		s = _stack_alloc();

		lt->stack_container = s;
		_lthread_set_stack(lt, s->stack, s->stack_size);

		/* allocate memory for TLS used by this thread */
		_lthread_tls_alloc(lt);

		lt->state = BIT(ST_LT_READY);
#if LTHREAD_DIAG
		init = 1;
#endif
	}

	DIAG_EVENT(lt, LT_DIAG_LTHREAD_RESUMED, init, lt);

	/* switch to the new thread */
	ctx_switch(&lt->ctx, &sched->ctx);

	/* If posting to a queue that could be read by another lcore
	 * we defer the queue write till now to ensure the context has been
	 * saved before the other core tries to resume it.
	 * This applies to blocking on mutex, cond, and to set_affinity.
	 */
	if (lt->pending_wr_queue != NULL) {
		struct lthread_queue *dest = lt->pending_wr_queue;

		lt->pending_wr_queue = NULL;

		/* queue the current thread to the specified queue */
		_lthread_queue_insert_mp(dest, lt);
	}

	sched->current_lthread = NULL;
}

/*
 * Handle sleep timer expiry
 */
void
_sched_timer_cb(struct rte_timer *tim, void *arg)
{
	struct lthread *lt = (struct lthread *) arg;
	uint64_t state = lt->state;

	DIAG_EVENT(lt, LT_DIAG_LTHREAD_TMR_EXPIRED, &lt->tim, 0);

	rte_timer_stop(tim);

	if (lt->state & BIT(ST_LT_CANCELLED))
		(THIS_SCHED)->nb_blocked_threads--;

	lt->state = state | BIT(ST_LT_EXPIRED);
	_lthread_resume(lt);
	lt->state = state & CLEARBIT(ST_LT_EXPIRED);
}


/*
 * Returns 0 if there is a pending job in the scheduler or 1 if done and
 * can exit.
 */
static inline int _lthread_sched_isdone(struct lthread_sched *sched)
{
	return ((sched->run_flag == 0) &&
			(_lthread_queue_empty(sched->ready)) &&
			(_lthread_queue_empty(sched->pready)) &&
			(sched->nb_blocked_threads == 0));
}

/*
 * Wait for all schedulers to start
 */
static inline void _lthread_schedulers_sync_start(void)
{
	rte_atomic16_inc(&active_schedulers);

	/* wait for lthread schedulers
	 * Note we use sched_yield() rather than pthread_yield() to allow
	 * for the possibility of a pthread wrapper on lthread_yield(),
	 * something that is not possible unless the scheduler is running.
	 */
	while (rte_atomic16_read(&active_schedulers) <
	       rte_atomic16_read(&num_schedulers))
		sched_yield();

}

/*
 * Wait for all schedulers to stop
 */
static inline void _lthread_schedulers_sync_stop(void)
{
	rte_atomic16_dec(&active_schedulers);
	rte_atomic16_dec(&num_schedulers);

	/* wait for schedulers
	 * Note we use sched_yield() rather than pthread_yield() to allow
	 * for the possibility of a pthread wrapper on lthread_yield(),
	 * something that is not possible unless the scheduler is running.
	 */
	while (rte_atomic16_read(&active_schedulers) > 0)
		sched_yield();

}


/*
 * Run the lthread scheduler
 * This loop is the heart of the system
 */
void lthread_run(void)
{
	struct lthread_sched *sched = THIS_SCHED;
	struct lthread *lt = NULL;

	RTE_LOG(INFO, LTHREAD,
		"starting scheduler %p on lcore %u phys core %u\n",
		sched, rte_lcore_id(),
		rte_lcore_index(rte_lcore_id()));

	/* if more than one, wait for all schedulers to start */
	_lthread_schedulers_sync_start();

	/*
	 * This is the main scheduling loop.
	 * So long as there are tasks in existence we run this loop.
	 * We check for:
	 *   expired timers,
	 *   the local ready queue,
	 *   and the peer ready queue,
	 *
	 * and resume lthreads ad infinitum.
	 */
	while (!_lthread_sched_isdone(sched)) {

		rte_timer_manage();

		lt = _lthread_queue_poll(sched->ready);
		if (lt != NULL)
			_lthread_resume(lt);
		lt = _lthread_queue_poll(sched->pready);
		if (lt != NULL)
			_lthread_resume(lt);
	}

	/* if more than one, wait for all schedulers to stop */
	_lthread_schedulers_sync_stop();

	(THIS_SCHED) = NULL;

	RTE_LOG(INFO, LTHREAD,
		"stopping scheduler %p on lcore %u phys core %u\n",
		sched, rte_lcore_id(),
		rte_lcore_index(rte_lcore_id()));
	fflush(stdout);
}
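
/*
 * Illustrative sketch of the per-lcore launch function referenced in the
 * comment at the top of this file; sched_main() and app_master() are
 * hypothetical application code. lthread_run() only returns after a
 * shutdown has been requested and the local queues have drained, so the
 * EAL thread seeds the scheduler with an initial lthread (created with
 * lthread_create() from the public API) before entering the loop.
 *
 *	static int sched_main(void *arg __rte_unused)
 *	{
 *		struct lthread *lt;
 *
 *		lthread_create(&lt, -1, app_master, NULL);
 *		lthread_run();
 *		return 0;
 *	}
 */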

/*
 * Return the scheduler for this lcore
 */
struct lthread_sched *_lthread_sched_get(int lcore_id)
{
	/* bounds check to avoid reading past the end of schedcore[] */
	if (lcore_id < 0 || lcore_id >= LTHREAD_MAX_LCORES)
		return NULL;
	return schedcore[lcore_id];
}

/*
 * Migrate the current thread to another scheduler running
 * on the specified lcore.
 */
int lthread_set_affinity(unsigned lcoreid)
{
	struct lthread *lt = THIS_LTHREAD;
	struct lthread_sched *dest_sched;

	/* bounds check to avoid reading past the end of schedcore[] */
	if (unlikely(lcoreid >= LTHREAD_MAX_LCORES))
		return POSIX_ERRNO(EINVAL);

	DIAG_EVENT(lt, LT_DIAG_LTHREAD_AFFINITY, lcoreid, 0);

	dest_sched = schedcore[lcoreid];

	if (unlikely(dest_sched == NULL))
		return POSIX_ERRNO(EINVAL);

	if (likely(dest_sched != THIS_SCHED)) {
		lt->sched = dest_sched;
		lt->pending_wr_queue = dest_sched->pready;
		_affinitize();
	}
	return 0;
}
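
/*
 * Illustrative sketch: an lthread may migrate itself at any point, for
 * example to run on the lcore that owns the data it is about to touch.
 * target_lcore is a hypothetical value chosen by the application; the
 * call only succeeds if a scheduler is already running on that lcore.
 *
 *	unsigned target_lcore = 2;
 *
 *	if (lthread_set_affinity(target_lcore) != 0)
 *		... no scheduler on target_lcore, handle the error ...
 */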