16d407e91704c8ae55cebcd428569021a2e8c0df
[dpdk.git] / lib / librte_eal / linuxapp / eal / eal_ivshmem.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #ifdef RTE_LIBRTE_IVSHMEM /* hide it from coverage */
35
36 #include <stdint.h>
37 #include <unistd.h>
38 #include <inttypes.h>
39 #include <sys/mman.h>
40 #include <sys/file.h>
41 #include <string.h>
42 #include <sys/queue.h>
43
44 #include <rte_log.h>
45 #include <rte_pci.h>
46 #include <rte_memory.h>
47 #include <rte_eal.h>
48 #include <rte_eal_memconfig.h>
49 #include <rte_string_fns.h>
50 #include <rte_errno.h>
51 #include <rte_ring.h>
52 #include <rte_mempool.h>
53 #include <rte_malloc.h>
54 #include <rte_common.h>
55 #include <rte_ivshmem.h>
56 #include <rte_tailq_elem.h>
57
58 #include "eal_internal_cfg.h"
59 #include "eal_private.h"
60
/* vendor/device ID pair advertised by the QEMU ivshmem PCI device */
#define PCI_VENDOR_ID_IVSHMEM 0x1Af4
#define PCI_DEVICE_ID_IVSHMEM 0x1110

/* magic number marking a valid metadata block at the tail of a BAR */
#define IVSHMEM_MAGIC 0x0BADC0DE
#define IVSHMEM_METADATA_SIZE 0x1000

/* sysfs path to BAR2 of a given PCI device (ivshmem memory is always BAR2) */
#define IVSHMEM_RESOURCE_PATH "/sys/bus/pci/devices/%04x:%02x:%02x.%x/resource2"
/* runtime config file shared between primary and secondary processes,
 * keyed by the hugefile prefix */
#define IVSHMEM_CONFIG_PATH "/var/run/.%s_ivshmem_config"

/* bit flags naming the address space in which two zones overlap/adjoin */
#define PHYS 0x1
#define VIRT 0x2
#define IOREMAP 0x4
#define FULL (PHYS|VIRT|IOREMAP)

/* metadata struct size rounded up to a whole number of pages */
#define METADATA_SIZE_ALIGNED \
        (RTE_ALIGN_CEIL(sizeof(struct rte_ivshmem_metadata),pagesz))

/* true if the start address of memzone y falls inside memzone x */
#define CONTAINS(x,y)\
        (((y).addr_64 >= (x).addr_64) && ((y).addr_64 < (x).addr_64 + (x).len))

/* number of elements in a static array */
#define DIM(x) (sizeof(x)/sizeof(x[0]))
82
/* one discovered ivshmem PCI device: its BAR2 resource path in sysfs and
 * the physical (ioremap) address of that BAR */
struct ivshmem_pci_device {
        char path[PATH_MAX];
        phys_addr_t ioremap_addr;
};

/* data type to store in config */
struct ivshmem_segment {
        struct rte_ivshmem_metadata_entry entry; /* memzone + offset metadata */
        uint64_t align;      /* page-alignment adjustment applied before mapping */
        char path[PATH_MAX]; /* resource path of the device this segment came from */
};
/* layout of the shared config file, mapped by primary and secondaries */
struct ivshmem_shared_config {
        struct ivshmem_segment segment[RTE_MAX_MEMSEG];
        uint32_t segment_idx; /* number of valid entries in segment[] */
        struct ivshmem_pci_device pci_devs[RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS];
        uint32_t pci_devs_idx; /* presumably count of valid pci_devs[] entries
                                * — populated during PCI scan, not visible here */
};
/* mapping of the shared config file; NULL means nothing was mapped */
static struct ivshmem_shared_config * ivshmem_config;
static int memseg_idx; /* number of memsegs produced by map_all_segments() */
static int pagesz;     /* cached getpagesize() result */

/* Tailq heads to add rings to */
TAILQ_HEAD(rte_ring_list, rte_tailq_entry);
106
107 /*
108  * Utility functions
109  */
110
111 static int
112 is_ivshmem_device(struct rte_pci_device * dev)
113 {
114         return (dev->id.vendor_id == PCI_VENDOR_ID_IVSHMEM
115                         && dev->id.device_id == PCI_DEVICE_ID_IVSHMEM);
116 }
117
118 static void *
119 map_metadata(int fd, uint64_t len)
120 {
121         size_t metadata_len = sizeof(struct rte_ivshmem_metadata);
122         size_t aligned_len = METADATA_SIZE_ALIGNED;
123
124         return mmap(NULL, metadata_len, PROT_READ | PROT_WRITE,
125                         MAP_SHARED, fd, len - aligned_len);
126 }
127
128 static void
129 unmap_metadata(void * ptr)
130 {
131         munmap(ptr, sizeof(struct rte_ivshmem_metadata));
132 }
133
134 static int
135 has_ivshmem_metadata(int fd, uint64_t len)
136 {
137         struct rte_ivshmem_metadata metadata;
138         void * ptr;
139
140         ptr = map_metadata(fd, len);
141
142         if (ptr == MAP_FAILED)
143                 return -1;
144
145         metadata = *(struct rte_ivshmem_metadata*) (ptr);
146
147         unmap_metadata(ptr);
148
149         return metadata.magic_number == IVSHMEM_MAGIC;
150 }
151
152 static void
153 remove_segment(struct ivshmem_segment * ms, int len, int idx)
154 {
155         int i;
156
157         for (i = idx; i < len - 1; i++)
158                 memcpy(&ms[i], &ms[i+1], sizeof(struct ivshmem_segment));
159         memset(&ms[len-1], 0, sizeof(struct ivshmem_segment));
160 }
161
162 static int
163 overlap(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
164 {
165         uint64_t start1, end1, start2, end2;
166         uint64_t p_start1, p_end1, p_start2, p_end2;
167         uint64_t i_start1, i_end1, i_start2, i_end2;
168         int result = 0;
169
170         /* gather virtual addresses */
171         start1 = mz1->addr_64;
172         end1 = mz1->addr_64 + mz1->len;
173         start2 = mz2->addr_64;
174         end2 = mz2->addr_64 + mz2->len;
175
176         /* gather physical addresses */
177         p_start1 = mz1->phys_addr;
178         p_end1 = mz1->phys_addr + mz1->len;
179         p_start2 = mz2->phys_addr;
180         p_end2 = mz2->phys_addr + mz2->len;
181
182         /* gather ioremap addresses */
183         i_start1 = mz1->ioremap_addr;
184         i_end1 = mz1->ioremap_addr + mz1->len;
185         i_start2 = mz2->ioremap_addr;
186         i_end2 = mz2->ioremap_addr + mz2->len;
187
188         /* check for overlap in virtual addresses */
189         if (start1 >= start2 && start1 < end2)
190                 result |= VIRT;
191         if (start2 >= start1 && start2 < end1)
192                 result |= VIRT;
193
194         /* check for overlap in physical addresses */
195         if (p_start1 >= p_start2 && p_start1 < p_end2)
196                 result |= PHYS;
197         if (p_start2 >= p_start1 && p_start2 < p_end1)
198                 result |= PHYS;
199
200         /* check for overlap in ioremap addresses */
201         if (i_start1 >= i_start2 && i_start1 < i_end2)
202                 result |= IOREMAP;
203         if (i_start2 >= i_start1 && i_start2 < i_end1)
204                 result |= IOREMAP;
205
206         return result;
207 }
208
209 static int
210 adjacent(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
211 {
212         uint64_t start1, end1, start2, end2;
213         uint64_t p_start1, p_end1, p_start2, p_end2;
214         uint64_t i_start1, i_end1, i_start2, i_end2;
215         int result = 0;
216
217         /* gather virtual addresses */
218         start1 = mz1->addr_64;
219         end1 = mz1->addr_64 + mz1->len;
220         start2 = mz2->addr_64;
221         end2 = mz2->addr_64 + mz2->len;
222
223         /* gather physical addresses */
224         p_start1 = mz1->phys_addr;
225         p_end1 = mz1->phys_addr + mz1->len;
226         p_start2 = mz2->phys_addr;
227         p_end2 = mz2->phys_addr + mz2->len;
228
229         /* gather ioremap addresses */
230         i_start1 = mz1->ioremap_addr;
231         i_end1 = mz1->ioremap_addr + mz1->len;
232         i_start2 = mz2->ioremap_addr;
233         i_end2 = mz2->ioremap_addr + mz2->len;
234
235         /* check if segments are virtually adjacent */
236         if (start1 == end2)
237                 result |= VIRT;
238         if (start2 == end1)
239                 result |= VIRT;
240
241         /* check if segments are physically adjacent */
242         if (p_start1 == p_end2)
243                 result |= PHYS;
244         if (p_start2 == p_end1)
245                 result |= PHYS;
246
247         /* check if segments are ioremap-adjacent */
248         if (i_start1 == i_end2)
249                 result |= IOREMAP;
250         if (i_start2 == i_end1)
251                 result |= IOREMAP;
252
253         return result;
254 }
255
256 static int
257 has_adjacent_segments(struct ivshmem_segment * ms, int len)
258 {
259         int i, j, a;
260
261         for (i = 0; i < len; i++)
262                 for (j = i + 1; j < len; j++) {
263                         a = adjacent(&ms[i].entry.mz, &ms[j].entry.mz);
264
265                         /* check if segments are adjacent virtually and/or physically but
266                          * not ioremap (since that would indicate that they are from
267                          * different PCI devices and thus don't need to be concatenated.
268                          */
269                         if ((a & (VIRT|PHYS)) > 0 && (a & IOREMAP) == 0)
270                                 return 1;
271                 }
272         return 0;
273 }
274
275 static int
276 has_overlapping_segments(struct ivshmem_segment * ms, int len)
277 {
278         int i, j;
279
280         for (i = 0; i < len; i++)
281                 for (j = i + 1; j < len; j++)
282                         if (overlap(&ms[i].entry.mz, &ms[j].entry.mz))
283                                 return 1;
284         return 0;
285 }
286
287 static int
288 seg_compare(const void * a, const void * b)
289 {
290         const struct ivshmem_segment * s1 = (const struct ivshmem_segment*) a;
291         const struct ivshmem_segment * s2 = (const struct ivshmem_segment*) b;
292
293         /* move unallocated zones to the end */
294         if (s1->entry.mz.addr == NULL && s2->entry.mz.addr == NULL)
295                 return 0;
296         if (s1->entry.mz.addr == 0)
297                 return 1;
298         if (s2->entry.mz.addr == 0)
299                 return -1;
300
301         return s1->entry.mz.phys_addr > s2->entry.mz.phys_addr;
302 }
303
#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
/* dump one metadata entry's virtual/physical/ioremap address ranges,
 * length and offset to the debug log (debug builds only) */
static void
entry_dump(struct rte_ivshmem_metadata_entry *e)
{
        RTE_LOG(DEBUG, EAL, "\tvirt: %p-%p\n", e->mz.addr,
                        RTE_PTR_ADD(e->mz.addr, e->mz.len));
        RTE_LOG(DEBUG, EAL, "\tphys: 0x%" PRIx64 "-0x%" PRIx64 "\n",
                        e->mz.phys_addr,
                        e->mz.phys_addr + e->mz.len);
        RTE_LOG(DEBUG, EAL, "\tio: 0x%" PRIx64 "-0x%" PRIx64 "\n",
                        e->mz.ioremap_addr,
                        e->mz.ioremap_addr + e->mz.len);
        RTE_LOG(DEBUG, EAL, "\tlen: 0x%" PRIx64 "\n", e->mz.len);
        RTE_LOG(DEBUG, EAL, "\toff: 0x%" PRIx64 "\n", e->offset);
}
#endif
320
321
322
323 /*
324  * Actual useful code
325  */
326
/* read through metadata mapped from the IVSHMEM device and append its
 * entries (plus the device's resource path) to the shared config.
 * Returns 0 on success, -1 on mapping failure or config-table overflow. */
static int
read_metadata(char * path, int path_len, int fd, uint64_t flen)
{
        struct rte_ivshmem_metadata metadata;
        struct rte_ivshmem_metadata_entry * entry;
        int idx, i;
        void * ptr;

        /* metadata sits at the tail of the BAR; map it, copy it out, and
         * unmap immediately */
        ptr = map_metadata(fd, flen);

        if (ptr == MAP_FAILED)
                return -1;

        metadata = *(struct rte_ivshmem_metadata*) (ptr);

        unmap_metadata(ptr);

        RTE_LOG(DEBUG, EAL, "Parsing metadata for \"%s\"\n", metadata.name);

        /* continue filling the shared config from wherever the previous
         * device left off */
        idx = ivshmem_config->segment_idx;

        for (i = 0; i < RTE_LIBRTE_IVSHMEM_MAX_ENTRIES &&
                idx <= RTE_MAX_MEMSEG; i++) {

                if (idx == RTE_MAX_MEMSEG) {
                        RTE_LOG(ERR, EAL, "Not enough memory segments!\n");
                        return -1;
                }

                entry = &metadata.entry[i];

                /* stop on uninitialized memzone */
                if (entry->mz.len == 0)
                        break;

                /* copy metadata entry */
                memcpy(&ivshmem_config->segment[idx].entry, entry,
                                sizeof(struct rte_ivshmem_metadata_entry));

                /* copy path */
                snprintf(ivshmem_config->segment[idx].path, path_len, "%s", path);

                idx++;
        }
        ivshmem_config->segment_idx = idx;

        return 0;
}
376
/* check through each segment and look for adjacent or overlapping ones.
 * Fully overlapping/adjacent pairs are merged in place; a partial overlap
 * is an error. Returns the new table length, or -1 on error. */
static int
cleanup_segments(struct ivshmem_segment * ms, int tbl_len)
{
        struct ivshmem_segment * s, * tmp;
        int i, j, concat, seg_adjacent, seg_overlapping;
        uint64_t start1, start2, end1, end2, p_start1, p_start2, i_start1, i_start2;

        /* sort by physical address (unallocated zones to the end) so that
         * merge candidates sit near each other */
        qsort(ms, tbl_len, sizeof(struct ivshmem_segment),
                                seg_compare);

        /* keep merging until no fully-overlapping or fully-adjacent pairs
         * remain; each merge removes one entry and restarts the scan */
        while (has_overlapping_segments(ms, tbl_len) ||
                        has_adjacent_segments(ms, tbl_len)) {

                for (i = 0; i < tbl_len; i++) {
                        s = &ms[i];

                        concat = 0;

                        for (j = i + 1; j < tbl_len; j++) {
                                tmp = &ms[j];

                                /* check if this segment is overlapping with existing segment,
                                 * or is adjacent to existing segment */
                                seg_overlapping = overlap(&s->entry.mz, &tmp->entry.mz);
                                seg_adjacent = adjacent(&s->entry.mz, &tmp->entry.mz);

                                /* check if segments fully overlap or are fully adjacent */
                                if ((seg_adjacent == FULL) || (seg_overlapping == FULL)) {

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Concatenating segments\n");
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
                                        entry_dump(&s->entry);
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
                                        entry_dump(&tmp->entry);
#endif

                                        start1 = s->entry.mz.addr_64;
                                        start2 = tmp->entry.mz.addr_64;
                                        p_start1 = s->entry.mz.phys_addr;
                                        p_start2 = tmp->entry.mz.phys_addr;
                                        i_start1 = s->entry.mz.ioremap_addr;
                                        i_start2 = tmp->entry.mz.ioremap_addr;
                                        end1 = s->entry.mz.addr_64 + s->entry.mz.len;
                                        end2 = tmp->entry.mz.addr_64 + tmp->entry.mz.len;

                                        /* settle for minimum start address and maximum length */
                                        s->entry.mz.addr_64 = RTE_MIN(start1, start2);
                                        s->entry.mz.phys_addr = RTE_MIN(p_start1, p_start2);
                                        s->entry.mz.ioremap_addr = RTE_MIN(i_start1, i_start2);
                                        s->entry.offset = RTE_MIN(s->entry.offset, tmp->entry.offset);
                                        s->entry.mz.len = RTE_MAX(end1, end2) - s->entry.mz.addr_64;
                                        concat = 1;

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Resulting segment:\n");
                                        entry_dump(&s->entry);

#endif
                                }
                                /* if segments not fully overlap, we have an error condition.
                                 * adjacent segments can coexist.
                                 */
                                else if (seg_overlapping > 0) {
                                        RTE_LOG(ERR, EAL, "Segments %i and %i overlap!\n", i, j);
#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
                                        entry_dump(&s->entry);
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
                                        entry_dump(&tmp->entry);
#endif
                                        return -1;
                                }
                                if (concat)
                                        break;
                        }
                        /* if we concatenated, remove segment at j */
                        if (concat) {
                                remove_segment(ms, tbl_len, j);
                                tbl_len--;
                                break;
                        }
                }
        }

        return tbl_len;
}
465
466 static int
467 create_shared_config(void)
468 {
469         char path[PATH_MAX];
470         int fd;
471
472         /* build ivshmem config file path */
473         snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
474                         internal_config.hugefile_prefix);
475
476         fd = open(path, O_CREAT | O_RDWR, 0600);
477
478         if (fd < 0) {
479                 RTE_LOG(ERR, EAL, "Could not open %s: %s\n", path, strerror(errno));
480                 return -1;
481         }
482
483         /* try ex-locking first - if the file is locked, we have a problem */
484         if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
485                 RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
486                 close(fd);
487                 return -1;
488         }
489
490         if (ftruncate(fd, sizeof(struct ivshmem_shared_config)) < 0) {
491                 RTE_LOG(ERR, EAL, "ftruncate failed: %s\n", strerror(errno));
492                 return -1;
493         }
494
495         ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
496                         PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
497
498         if (ivshmem_config == MAP_FAILED)
499                 return -1;
500
501         memset(ivshmem_config, 0, sizeof(struct ivshmem_shared_config));
502
503         /* change the exclusive lock we got earlier to a shared lock */
504         if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
505                 RTE_LOG(ERR, EAL, "Locking %s failed: %s \n", path, strerror(errno));
506                 return -1;
507         }
508
509         close(fd);
510
511         return 0;
512 }
513
514 /* open shared config file and, if present, map the config.
515  * having no config file is not an error condition, as we later check if
516  * ivshmem_config is NULL (if it is, that means nothing was mapped). */
517 static int
518 open_shared_config(void)
519 {
520         char path[PATH_MAX];
521         int fd;
522
523         /* build ivshmem config file path */
524         snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
525                         internal_config.hugefile_prefix);
526
527         fd = open(path, O_RDONLY);
528
529         /* if the file doesn't exist, just return success */
530         if (fd < 0 && errno == ENOENT)
531                 return 0;
532         /* else we have an error condition */
533         else if (fd < 0) {
534                 RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
535                                 path, strerror(errno));
536                 return -1;
537         }
538
539         /* try ex-locking first - if the lock *does* succeed, this means it's a
540          * stray config file, so it should be deleted.
541          */
542         if (flock(fd, LOCK_EX | LOCK_NB) != -1) {
543
544                 /* if we can't remove the file, something is wrong */
545                 if (unlink(path) < 0) {
546                         RTE_LOG(ERR, EAL, "Could not remove %s: %s\n", path,
547                                         strerror(errno));
548                         return -1;
549                 }
550
551                 /* release the lock */
552                 flock(fd, LOCK_UN);
553                 close(fd);
554
555                 /* return success as having a stray config file is equivalent to not
556                  * having config file at all.
557                  */
558                 return 0;
559         }
560
561         ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
562                         PROT_READ, MAP_SHARED, fd, 0);
563
564         if (ivshmem_config == MAP_FAILED)
565                 return -1;
566
567         /* place a shared lock on config file */
568         if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
569                 RTE_LOG(ERR, EAL, "Locking %s failed: %s \n", path, strerror(errno));
570                 return -1;
571         }
572
573         close(fd);
574
575         return 0;
576 }
577
578 /*
579  * This function does the following:
580  *
581  * 1) Builds a table of ivshmem_segments with proper offset alignment
582  * 2) Cleans up that table so that we don't have any overlapping or adjacent
583  *    memory segments
584  * 3) Creates memsegs from this table and maps them into memory.
585  */
586 static inline int
587 map_all_segments(void)
588 {
589         struct ivshmem_segment ms_tbl[RTE_MAX_MEMSEG];
590         struct ivshmem_pci_device * pci_dev;
591         struct rte_mem_config * mcfg;
592         struct ivshmem_segment * seg;
593         int fd, fd_zero;
594         unsigned i, j;
595         struct rte_memzone mz;
596         struct rte_memseg ms;
597         void * base_addr;
598         uint64_t align, len;
599         phys_addr_t ioremap_addr;
600
601         ioremap_addr = 0;
602
603         memset(ms_tbl, 0, sizeof(ms_tbl));
604         memset(&mz, 0, sizeof(struct rte_memzone));
605         memset(&ms, 0, sizeof(struct rte_memseg));
606
607         /* first, build a table of memsegs to map, to avoid failed mmaps due to
608          * overlaps
609          */
610         for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMSEG; i++) {
611                 if (i == RTE_MAX_MEMSEG) {
612                         RTE_LOG(ERR, EAL, "Too many segments requested!\n");
613                         return -1;
614                 }
615
616                 seg = &ivshmem_config->segment[i];
617
618                 /* copy segment to table */
619                 memcpy(&ms_tbl[i], seg, sizeof(struct ivshmem_segment));
620
621                 /* find ioremap addr */
622                 for (j = 0; j < DIM(ivshmem_config->pci_devs); j++) {
623                         pci_dev = &ivshmem_config->pci_devs[j];
624                         if (!strncmp(pci_dev->path, seg->path, sizeof(pci_dev->path))) {
625                                 ioremap_addr = pci_dev->ioremap_addr;
626                                 break;
627                         }
628                 }
629                 if (ioremap_addr == 0) {
630                         RTE_LOG(ERR, EAL, "Cannot find ioremap addr!\n");
631                         return -1;
632                 }
633
634                 /* work out alignments */
635                 align = seg->entry.mz.addr_64 -
636                                 RTE_ALIGN_FLOOR(seg->entry.mz.addr_64, 0x1000);
637                 len = RTE_ALIGN_CEIL(seg->entry.mz.len + align, 0x1000);
638
639                 /* save original alignments */
640                 ms_tbl[i].align = align;
641
642                 /* create a memory zone */
643                 mz.addr_64 = seg->entry.mz.addr_64 - align;
644                 mz.len = len;
645                 mz.hugepage_sz = seg->entry.mz.hugepage_sz;
646                 mz.phys_addr = seg->entry.mz.phys_addr - align;
647
648                 /* find true physical address */
649                 mz.ioremap_addr = ioremap_addr + seg->entry.offset - align;
650
651                 ms_tbl[i].entry.offset = seg->entry.offset - align;
652
653                 memcpy(&ms_tbl[i].entry.mz, &mz, sizeof(struct rte_memzone));
654         }
655
656         /* clean up the segments */
657         memseg_idx = cleanup_segments(ms_tbl, ivshmem_config->segment_idx);
658
659         if (memseg_idx < 0)
660                 return -1;
661
662         mcfg = rte_eal_get_configuration()->mem_config;
663
664         fd_zero = open("/dev/zero", O_RDWR);
665
666         if (fd_zero < 0) {
667                 RTE_LOG(ERR, EAL, "Cannot open /dev/zero: %s\n", strerror(errno));
668                 return -1;
669         }
670
671         /* create memsegs and put them into DPDK memory */
672         for (i = 0; i < (unsigned) memseg_idx; i++) {
673
674                 seg = &ms_tbl[i];
675
676                 ms.addr_64 = seg->entry.mz.addr_64;
677                 ms.hugepage_sz = seg->entry.mz.hugepage_sz;
678                 ms.len = seg->entry.mz.len;
679                 ms.nchannel = rte_memory_get_nchannel();
680                 ms.nrank = rte_memory_get_nrank();
681                 ms.phys_addr = seg->entry.mz.phys_addr;
682                 ms.ioremap_addr = seg->entry.mz.ioremap_addr;
683                 ms.socket_id = seg->entry.mz.socket_id;
684
685                 base_addr = mmap(ms.addr, ms.len,
686                                 PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_zero, 0);
687
688                 if (base_addr == MAP_FAILED || base_addr != ms.addr) {
689                         RTE_LOG(ERR, EAL, "Cannot map /dev/zero!\n");
690                         return -1;
691                 }
692
693                 fd = open(seg->path, O_RDWR);
694
695                 if (fd < 0) {
696                         RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", seg->path,
697                                         strerror(errno));
698                         return -1;
699                 }
700
701                 munmap(ms.addr, ms.len);
702
703                 base_addr = mmap(ms.addr, ms.len,
704                                 PROT_READ | PROT_WRITE, MAP_SHARED, fd,
705                                 seg->entry.offset);
706
707
708                 if (base_addr == MAP_FAILED || base_addr != ms.addr) {
709                         RTE_LOG(ERR, EAL, "Cannot map segment into memory: "
710                                         "expected %p got %p (%s)\n", ms.addr, base_addr,
711                                         strerror(errno));
712                         return -1;
713                 }
714
715                 RTE_LOG(DEBUG, EAL, "Memory segment mapped: %p (len %" PRIx64 ") at "
716                                 "offset 0x%" PRIx64 "\n",
717                                 ms.addr, ms.len, seg->entry.offset);
718
719                 /* put the pointers back into their real positions using original
720                  * alignment */
721                 ms.addr_64 += seg->align;
722                 ms.phys_addr += seg->align;
723                 ms.ioremap_addr += seg->align;
724                 ms.len -= seg->align;
725
726                 /* at this point, the rest of DPDK memory is not initialized, so we
727                  * expect memsegs to be empty */
728                 memcpy(&mcfg->memseg[i], &ms,
729                                 sizeof(struct rte_memseg));
730                 memcpy(&mcfg->free_memseg[i], &ms,
731                                 sizeof(struct rte_memseg));
732
733
734                 /* adjust the free_memseg so that there's no free space left */
735                 mcfg->free_memseg[i].ioremap_addr += mcfg->free_memseg[i].len;
736                 mcfg->free_memseg[i].phys_addr += mcfg->free_memseg[i].len;
737                 mcfg->free_memseg[i].addr_64 += mcfg->free_memseg[i].len;
738                 mcfg->free_memseg[i].len = 0;
739
740                 close(fd);
741
742                 RTE_LOG(DEBUG, EAL, "IVSHMEM segment found, size: 0x%lx\n",
743                                 ms.len);
744         }
745
746         return 0;
747 }
748
749 /* this happens at a later stage, after general EAL memory initialization */
750 int
751 rte_eal_ivshmem_obj_init(void)
752 {
753         struct rte_ring_list* ring_list = NULL;
754         struct rte_mem_config * mcfg;
755         struct ivshmem_segment * seg;
756         struct rte_memzone * mz;
757         struct rte_ring * r;
758         struct rte_tailq_entry *te;
759         unsigned i, ms, idx;
760         uint64_t offset;
761
762         /* secondary process would not need any object discovery - it'll all
763          * already be in shared config */
764         if (rte_eal_process_type() != RTE_PROC_PRIMARY || ivshmem_config == NULL)
765                 return 0;
766
767         /* check that we have an initialised ring tail queue */
768         if ((ring_list =
769              RTE_TAILQ_LOOKUP_BY_IDX(RTE_TAILQ_RING, rte_ring_list)) == NULL) {
770                 RTE_LOG(ERR, EAL, "No rte_ring tailq found!\n");
771                 return -1;
772         }
773
774         mcfg = rte_eal_get_configuration()->mem_config;
775
776         /* create memzones */
777         for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMZONE; i++) {
778
779                 seg = &ivshmem_config->segment[i];
780
781                 /* add memzone */
782                 if (mcfg->memzone_idx == RTE_MAX_MEMZONE) {
783                         RTE_LOG(ERR, EAL, "No more memory zones available!\n");
784                         return -1;
785                 }
786
787                 idx = mcfg->memzone_idx;
788
789                 RTE_LOG(DEBUG, EAL, "Found memzone: '%s' at %p (len 0x%" PRIx64 ")\n",
790                                 seg->entry.mz.name, seg->entry.mz.addr, seg->entry.mz.len);
791
792                 memcpy(&mcfg->memzone[idx], &seg->entry.mz,
793                                 sizeof(struct rte_memzone));
794
795                 /* find ioremap address */
796                 for (ms = 0; ms <= RTE_MAX_MEMSEG; ms++) {
797                         if (ms == RTE_MAX_MEMSEG) {
798                                 RTE_LOG(ERR, EAL, "Physical address of segment not found!\n");
799                                 return -1;
800                         }
801                         if (CONTAINS(mcfg->memseg[ms], mcfg->memzone[idx])) {
802                                 offset = mcfg->memzone[idx].addr_64 -
803                                                                 mcfg->memseg[ms].addr_64;
804                                 mcfg->memzone[idx].ioremap_addr = mcfg->memseg[ms].ioremap_addr +
805                                                 offset;
806                                 break;
807                         }
808                 }
809
810                 mcfg->memzone_idx++;
811         }
812
813         rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
814
815         /* find rings */
816         for (i = 0; i < mcfg->memzone_idx; i++) {
817                 mz = &mcfg->memzone[i];
818
819                 /* check if memzone has a ring prefix */
820                 if (strncmp(mz->name, RTE_RING_MZ_PREFIX,
821                                 sizeof(RTE_RING_MZ_PREFIX) - 1) != 0)
822                         continue;
823
824                 r = (struct rte_ring*) (mz->addr_64);
825
826                 te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0);
827                 if (te == NULL) {
828                         RTE_LOG(ERR, EAL, "Cannot allocate ring tailq entry!\n");
829                         return -1;
830                 }
831
832                 te->data = (void *) r;
833
834                 TAILQ_INSERT_TAIL(ring_list, te, next);
835
836                 RTE_LOG(DEBUG, EAL, "Found ring: '%s' at %p\n", r->name, mz->addr);
837         }
838         rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
839
840 #ifdef RTE_LIBRTE_IVSHMEM_DEBUG
841         rte_memzone_dump(stdout);
842         rte_ring_list_dump(stdout);
843 #endif
844
845         return 0;
846 }
847
848 /* initialize ivshmem structures */
849 int rte_eal_ivshmem_init(void)
850 {
851         struct rte_pci_device * dev;
852         struct rte_pci_resource * res;
853         int fd, ret;
854         char path[PATH_MAX];
855
856         /* initialize everything to 0 */
857         memset(path, 0, sizeof(path));
858         ivshmem_config = NULL;
859
860         pagesz = getpagesize();
861
862         RTE_LOG(DEBUG, EAL, "Searching for IVSHMEM devices...\n");
863
864         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
865
866                 if (open_shared_config() < 0) {
867                         RTE_LOG(ERR, EAL, "Could not open IVSHMEM config!\n");
868                         return -1;
869                 }
870         }
871         else {
872
873                 TAILQ_FOREACH(dev, &pci_device_list, next) {
874
875                         if (is_ivshmem_device(dev)) {
876
877                                 /* IVSHMEM memory is always on BAR2 */
878                                 res = &dev->mem_resource[2];
879
880                                 /* if we don't have a BAR2 */
881                                 if (res->len == 0)
882                                         continue;
883
884                                 /* construct pci device path */
885                                 snprintf(path, sizeof(path), IVSHMEM_RESOURCE_PATH,
886                                                 dev->addr.domain, dev->addr.bus, dev->addr.devid,
887                                                 dev->addr.function);
888
889                                 /* try to find memseg */
890                                 fd = open(path, O_RDWR);
891                                 if (fd < 0) {
892                                         RTE_LOG(ERR, EAL, "Could not open %s\n", path);
893                                         return -1;
894                                 }
895
896                                 /* check if it's a DPDK IVSHMEM device */
897                                 ret = has_ivshmem_metadata(fd, res->len);
898
899                                 /* is DPDK device */
900                                 if (ret == 1) {
901
902                                         /* config file creation is deferred until the first
903                                          * DPDK device is found. then, it has to be created
904                                          * only once. */
905                                         if (ivshmem_config == NULL &&
906                                                         create_shared_config() < 0) {
907                                                 RTE_LOG(ERR, EAL, "Could not create IVSHMEM config!\n");
908                                                 close(fd);
909                                                 return -1;
910                                         }
911
912                                         if (read_metadata(path, sizeof(path), fd, res->len) < 0) {
913                                                 RTE_LOG(ERR, EAL, "Could not read metadata from"
914                                                                 " device %02x:%02x.%x!\n", dev->addr.bus,
915                                                                 dev->addr.devid, dev->addr.function);
916                                                 close(fd);
917                                                 return -1;
918                                         }
919
920                                         if (ivshmem_config->pci_devs_idx == RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS) {
921                                                 RTE_LOG(WARNING, EAL,
922                                                                 "IVSHMEM PCI device limit exceeded. Increase "
923                                                                 "CONFIG_RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS  in "
924                                                                 "your config file.\n");
925                                                 break;
926                                         }
927
928                                         RTE_LOG(INFO, EAL, "Found IVSHMEM device %02x:%02x.%x\n",
929                                                         dev->addr.bus, dev->addr.devid, dev->addr.function);
930
931                                         ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].ioremap_addr = res->phys_addr;
932                                         snprintf(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path,
933                                                         sizeof(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path),
934                                                         "%s", path);
935
936                                         ivshmem_config->pci_devs_idx++;
937                                 }
938                                 /* failed to read */
939                                 else if (ret < 0) {
940                                         RTE_LOG(ERR, EAL, "Could not read IVSHMEM device: %s\n",
941                                                         strerror(errno));
942                                         close(fd);
943                                         return -1;
944                                 }
945                                 /* not a DPDK device */
946                                 else
947                                         RTE_LOG(DEBUG, EAL, "Skipping non-DPDK IVSHMEM device\n");
948
949                                 /* close the BAR fd */
950                                 close(fd);
951                         }
952                 }
953         }
954
955         /* ivshmem_config is not NULL only if config was created and/or mapped */
956         if (ivshmem_config) {
957                 if (map_all_segments() < 0) {
958                         RTE_LOG(ERR, EAL, "Mapping IVSHMEM segments failed!\n");
959                         return -1;
960                 }
961         }
962         else {
963                 RTE_LOG(DEBUG, EAL, "No IVSHMEM configuration found! \n");
964         }
965
966         return 0;
967 }
968
969 #endif