/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef RTE_LIBRTE_IVSHMEM /* hide it from coverage */

#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <limits.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <string.h>
#include <sys/queue.h>

#include <rte_log.h>
#include <rte_pci.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_string_fns.h>
#include <rte_errno.h>
#include <rte_ring.h>
#include <rte_mempool.h>
#include <rte_common.h>
#include <rte_ivshmem.h>
#include <rte_tailq_elem.h>

#include "eal_internal_cfg.h"
#include "eal_private.h"

#define PCI_VENDOR_ID_IVSHMEM 0x1Af4
#define PCI_DEVICE_ID_IVSHMEM 0x1110

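/*
 * A DPDK-formatted IVSHMEM device carries a metadata block at the tail end
 * of its BAR2; the block is recognized by the magic number below (see
 * map_metadata() and has_ivshmem_metadata()).
 */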
#define IVSHMEM_MAGIC 0x0BADC0DE
#define IVSHMEM_METADATA_SIZE 0x1000

#define IVSHMEM_RESOURCE_PATH "/sys/bus/pci/devices/%04x:%02x:%02x.%x/resource2"
#define IVSHMEM_CONFIG_PATH "/var/run/.%s_ivshmem_config"

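/*
 * Bit flags used by overlap() and adjacent() to report in which address
 * spaces (physical, virtual and/or ioremap) two memzones overlap or touch.
 */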
#define PHYS 0x1
#define VIRT 0x2
#define IOREMAP 0x4
#define FULL (PHYS|VIRT|IOREMAP)

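/* metadata size rounded up to the nearest multiple of the page size */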
#define METADATA_SIZE_ALIGNED \
        (RTE_ALIGN_CEIL(sizeof(struct rte_ivshmem_metadata),pagesz))

#define CONTAINS(x,y)\
        (((y).addr_64 >= (x).addr_64) && ((y).addr_64 < (x).addr_64 + (x).len))

#define DIM(x) (sizeof(x)/sizeof(x[0]))

struct ivshmem_pci_device {
        char path[PATH_MAX];
        phys_addr_t ioremap_addr;
};

/* data type to store in config */
struct ivshmem_segment {
        struct rte_ivshmem_metadata_entry entry;
        uint64_t align;
        char path[PATH_MAX];
};
struct ivshmem_shared_config {
        struct ivshmem_segment segment[RTE_MAX_MEMSEG];
        uint32_t segment_idx;
        struct ivshmem_pci_device pci_devs[RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS];
        uint32_t pci_devs_idx;
};
static struct ivshmem_shared_config * ivshmem_config;
static int memseg_idx;
static int pagesz;

/* Tailq heads to add rings to */
TAILQ_HEAD(rte_ring_list, rte_ring);

/*
 * Utility functions
 */

static int
is_ivshmem_device(struct rte_pci_device * dev)
{
        return (dev->id.vendor_id == PCI_VENDOR_ID_IVSHMEM
                        && dev->id.device_id == PCI_DEVICE_ID_IVSHMEM);
}

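/*
 * Map the metadata block. It occupies the last METADATA_SIZE_ALIGNED bytes
 * of the BAR, so the mmap offset is computed from the end of the BAR
 * ("len" is the full BAR length).
 */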
static void *
map_metadata(int fd, uint64_t len)
{
        size_t metadata_len = sizeof(struct rte_ivshmem_metadata);
        size_t aligned_len = METADATA_SIZE_ALIGNED;

        return mmap(NULL, metadata_len, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, len - aligned_len);
}

static void
unmap_metadata(void * ptr)
{
        munmap(ptr, sizeof(struct rte_ivshmem_metadata));
}

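/*
 * Check if the device carries DPDK metadata. Returns 1 if the magic number
 * matches, 0 if it does not, and -1 if the metadata cannot be mapped.
 */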
static int
has_ivshmem_metadata(int fd, uint64_t len)
{
        struct rte_ivshmem_metadata metadata;
        void * ptr;

        ptr = map_metadata(fd, len);

        if (ptr == MAP_FAILED)
                return -1;

        metadata = *(struct rte_ivshmem_metadata*) (ptr);

        unmap_metadata(ptr);

        return metadata.magic_number == IVSHMEM_MAGIC;
}

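/* remove entry "idx" from a segment table by shifting later entries down */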
static void
remove_segment(struct ivshmem_segment * ms, int len, int idx)
{
        int i;

        for (i = idx; i < len - 1; i++)
                memcpy(&ms[i], &ms[i+1], sizeof(struct ivshmem_segment));
        memset(&ms[len-1], 0, sizeof(struct ivshmem_segment));
}

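/*
 * Check whether two memzones overlap in virtual, physical and/or ioremap
 * address space; returns a bitmask of VIRT, PHYS and IOREMAP flags.
 */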
static int
overlap(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
{
        uint64_t start1, end1, start2, end2;
        uint64_t p_start1, p_end1, p_start2, p_end2;
        uint64_t i_start1, i_end1, i_start2, i_end2;
        int result = 0;

        /* gather virtual addresses */
        start1 = mz1->addr_64;
        end1 = mz1->addr_64 + mz1->len;
        start2 = mz2->addr_64;
        end2 = mz2->addr_64 + mz2->len;

        /* gather physical addresses */
        p_start1 = mz1->phys_addr;
        p_end1 = mz1->phys_addr + mz1->len;
        p_start2 = mz2->phys_addr;
        p_end2 = mz2->phys_addr + mz2->len;

        /* gather ioremap addresses */
        i_start1 = mz1->ioremap_addr;
        i_end1 = mz1->ioremap_addr + mz1->len;
        i_start2 = mz2->ioremap_addr;
        i_end2 = mz2->ioremap_addr + mz2->len;

        /* check for overlap in virtual addresses */
        if (start1 >= start2 && start1 < end2)
                result |= VIRT;
        if (start2 >= start1 && start2 < end1)
                result |= VIRT;

        /* check for overlap in physical addresses */
        if (p_start1 >= p_start2 && p_start1 < p_end2)
                result |= PHYS;
        if (p_start2 >= p_start1 && p_start2 < p_end1)
                result |= PHYS;

        /* check for overlap in ioremap addresses */
        if (i_start1 >= i_start2 && i_start1 < i_end2)
                result |= IOREMAP;
        if (i_start2 >= i_start1 && i_start2 < i_end1)
                result |= IOREMAP;

        return result;
}

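/*
 * Check whether two memzones are contiguous in virtual, physical and/or
 * ioremap address space; returns a bitmask of VIRT, PHYS and IOREMAP flags.
 */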
static int
adjacent(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
{
        uint64_t start1, end1, start2, end2;
        uint64_t p_start1, p_end1, p_start2, p_end2;
        uint64_t i_start1, i_end1, i_start2, i_end2;
        int result = 0;

        /* gather virtual addresses */
        start1 = mz1->addr_64;
        end1 = mz1->addr_64 + mz1->len;
        start2 = mz2->addr_64;
        end2 = mz2->addr_64 + mz2->len;

        /* gather physical addresses */
        p_start1 = mz1->phys_addr;
        p_end1 = mz1->phys_addr + mz1->len;
        p_start2 = mz2->phys_addr;
        p_end2 = mz2->phys_addr + mz2->len;

        /* gather ioremap addresses */
        i_start1 = mz1->ioremap_addr;
        i_end1 = mz1->ioremap_addr + mz1->len;
        i_start2 = mz2->ioremap_addr;
        i_end2 = mz2->ioremap_addr + mz2->len;

        /* check if segments are virtually adjacent */
        if (start1 == end2)
                result |= VIRT;
        if (start2 == end1)
                result |= VIRT;

        /* check if segments are physically adjacent */
        if (p_start1 == p_end2)
                result |= PHYS;
        if (p_start2 == p_end1)
                result |= PHYS;

        /* check if segments are ioremap-adjacent */
        if (i_start1 == i_end2)
                result |= IOREMAP;
        if (i_start2 == i_end1)
                result |= IOREMAP;

        return result;
}

static int
has_adjacent_segments(struct ivshmem_segment * ms, int len)
{
        int i, j, a;

        for (i = 0; i < len; i++)
                for (j = i + 1; j < len; j++) {
                        a = adjacent(&ms[i].entry.mz, &ms[j].entry.mz);

                        /* check if segments are adjacent virtually and/or physically but
                         * not in ioremap space (that would indicate they come from
                         * different PCI devices and thus don't need to be concatenated).
                         */
                        if ((a & (VIRT|PHYS)) > 0 && (a & IOREMAP) == 0)
                                return 1;
                }
        return 0;
}

static int
has_overlapping_segments(struct ivshmem_segment * ms, int len)
{
        int i, j;

        for (i = 0; i < len; i++)
                for (j = i + 1; j < len; j++)
                        if (overlap(&ms[i].entry.mz, &ms[j].entry.mz))
                                return 1;
        return 0;
}

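/*
 * qsort() comparator: order segments by physical address, with unallocated
 * zones pushed to the end of the table.
 */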
static int
seg_compare(const void * a, const void * b)
{
        const struct ivshmem_segment * s1 = (const struct ivshmem_segment*) a;
        const struct ivshmem_segment * s2 = (const struct ivshmem_segment*) b;

        /* move unallocated zones to the end */
        if (s1->entry.mz.addr == NULL && s2->entry.mz.addr == NULL)
                return 0;
        if (s1->entry.mz.addr == NULL)
                return 1;
        if (s2->entry.mz.addr == NULL)
                return -1;

        /* qsort() expects a negative/zero/positive result; a bare ">"
         * comparison never reports "less than" and leaves the sort order
         * unspecified, so do a proper three-way comparison */
        if (s1->entry.mz.phys_addr < s2->entry.mz.phys_addr)
                return -1;
        if (s1->entry.mz.phys_addr > s2->entry.mz.phys_addr)
                return 1;
        return 0;
}

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
static void
entry_dump(struct rte_ivshmem_metadata_entry *e)
{
        RTE_LOG(DEBUG, EAL, "\tvirt: %p-%p\n", e->mz.addr,
                        RTE_PTR_ADD(e->mz.addr, e->mz.len));
        RTE_LOG(DEBUG, EAL, "\tphys: 0x%" PRIx64 "-0x%" PRIx64 "\n",
                        e->mz.phys_addr,
                        e->mz.phys_addr + e->mz.len);
        RTE_LOG(DEBUG, EAL, "\tio: 0x%" PRIx64 "-0x%" PRIx64 "\n",
                        e->mz.ioremap_addr,
                        e->mz.ioremap_addr + e->mz.len);
        RTE_LOG(DEBUG, EAL, "\tlen: 0x%" PRIx64 "\n", e->mz.len);
        RTE_LOG(DEBUG, EAL, "\toff: 0x%" PRIx64 "\n", e->offset);
}
#endif

/*
 * Actual useful code
 */

/* read through metadata mapped from the IVSHMEM device */
static int
read_metadata(char * path, int path_len, int fd, uint64_t flen)
{
        struct rte_ivshmem_metadata metadata;
        struct rte_ivshmem_metadata_entry * entry;
        int idx, i;
        void * ptr;

        ptr = map_metadata(fd, flen);

        if (ptr == MAP_FAILED)
                return -1;

        metadata = *(struct rte_ivshmem_metadata*) (ptr);

        unmap_metadata(ptr);

        RTE_LOG(DEBUG, EAL, "Parsing metadata for \"%s\"\n", metadata.name);

        idx = ivshmem_config->segment_idx;

        for (i = 0; i < RTE_LIBRTE_IVSHMEM_MAX_ENTRIES &&
                idx <= RTE_MAX_MEMSEG; i++) {

                if (idx == RTE_MAX_MEMSEG) {
                        RTE_LOG(ERR, EAL, "Not enough memory segments!\n");
                        return -1;
                }

                entry = &metadata.entry[i];

                /* stop on uninitialized memzone */
                if (entry->mz.len == 0)
                        break;

                /* copy metadata entry */
                memcpy(&ivshmem_config->segment[idx].entry, entry,
                                sizeof(struct rte_ivshmem_metadata_entry));

                /* copy path */
                rte_snprintf(ivshmem_config->segment[idx].path, path_len, "%s", path);

                idx++;
        }
        ivshmem_config->segment_idx = idx;

        return 0;
}

/* check through each segment and look for adjacent or overlapping ones. */
static int
cleanup_segments(struct ivshmem_segment * ms, int tbl_len)
{
        struct ivshmem_segment * s, * tmp;
        int i, j, concat, seg_adjacent, seg_overlapping;
        uint64_t start1, start2, end1, end2, p_start1, p_start2, i_start1, i_start2;

        qsort(ms, tbl_len, sizeof(struct ivshmem_segment),
                                seg_compare);

        while (has_overlapping_segments(ms, tbl_len) ||
                        has_adjacent_segments(ms, tbl_len)) {

                for (i = 0; i < tbl_len; i++) {
                        s = &ms[i];

                        concat = 0;

                        for (j = i + 1; j < tbl_len; j++) {
                                tmp = &ms[j];

                                /* check if this segment is overlapping with existing segment,
                                 * or is adjacent to existing segment */
                                seg_overlapping = overlap(&s->entry.mz, &tmp->entry.mz);
                                seg_adjacent = adjacent(&s->entry.mz, &tmp->entry.mz);

                                /* check if segments fully overlap or are fully adjacent */
                                if ((seg_adjacent == FULL) || (seg_overlapping == FULL)) {

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Concatenating segments\n");
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
                                        entry_dump(&s->entry);
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
                                        entry_dump(&tmp->entry);
#endif

                                        start1 = s->entry.mz.addr_64;
                                        start2 = tmp->entry.mz.addr_64;
                                        p_start1 = s->entry.mz.phys_addr;
                                        p_start2 = tmp->entry.mz.phys_addr;
                                        i_start1 = s->entry.mz.ioremap_addr;
                                        i_start2 = tmp->entry.mz.ioremap_addr;
                                        end1 = s->entry.mz.addr_64 + s->entry.mz.len;
                                        end2 = tmp->entry.mz.addr_64 + tmp->entry.mz.len;

                                        /* settle for minimum start address and maximum length */
                                        s->entry.mz.addr_64 = RTE_MIN(start1, start2);
                                        s->entry.mz.phys_addr = RTE_MIN(p_start1, p_start2);
                                        s->entry.mz.ioremap_addr = RTE_MIN(i_start1, i_start2);
                                        s->entry.offset = RTE_MIN(s->entry.offset, tmp->entry.offset);
                                        s->entry.mz.len = RTE_MAX(end1, end2) - s->entry.mz.addr_64;
                                        concat = 1;

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Resulting segment:\n");
                                        entry_dump(&s->entry);
#endif
                                }
                                /* segments that overlap only partially are an error
                                 * condition; partially adjacent segments, however, can
                                 * coexist.
                                 */
                                else if (seg_overlapping > 0) {
                                        RTE_LOG(ERR, EAL, "Segments %i and %i overlap!\n", i, j);
#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
                                        entry_dump(&s->entry);
                                        RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
                                        entry_dump(&tmp->entry);
#endif
                                        return -1;
                                }
                                if (concat)
                                        break;
                        }
                        /* if we concatenated, remove segment at j */
                        if (concat) {
                                remove_segment(ms, tbl_len, j);
                                tbl_len--;
                                break;
                        }
                }
        }

        return tbl_len;
}

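/*
 * Create, size and map the shared config file. The file is held under an
 * exclusive lock while it is being set up; the lock is downgraded to a
 * shared one once the mapping is in place.
 */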
static int
create_shared_config(void)
{
        char path[PATH_MAX];
        int fd;

        /* build ivshmem config file path */
        rte_snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
                        internal_config.hugefile_prefix);

        fd = open(path, O_CREAT | O_RDWR, 0600);

        if (fd < 0) {
                RTE_LOG(ERR, EAL, "Could not open %s: %s\n", path, strerror(errno));
                return -1;
        }

        /* try ex-locking first - if the file is locked, we have a problem */
        if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
                RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
                close(fd);
                return -1;
        }

        if (ftruncate(fd, sizeof(struct ivshmem_shared_config)) < 0) {
                RTE_LOG(ERR, EAL, "ftruncate failed: %s\n", strerror(errno));
                close(fd);
                return -1;
        }

        ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        if (ivshmem_config == MAP_FAILED) {
                close(fd);
                return -1;
        }

        memset(ivshmem_config, 0, sizeof(struct ivshmem_shared_config));

        /* change the exclusive lock we got earlier to a shared lock */
        if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
                RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
                close(fd);
                return -1;
        }

        close(fd);

        return 0;
}

/* open shared config file and, if present, map the config.
 * having no config file is not an error condition, as we later check if
 * ivshmem_config is NULL (if it is, that means nothing was mapped). */
static int
open_shared_config(void)
{
        char path[PATH_MAX];
        int fd;

        /* build ivshmem config file path */
        rte_snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
                        internal_config.hugefile_prefix);

        fd = open(path, O_RDONLY);

        /* if the file doesn't exist, just return success */
        if (fd < 0 && errno == ENOENT)
                return 0;
        /* else we have an error condition */
        else if (fd < 0) {
                RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
                                path, strerror(errno));
                return -1;
        }

        /* try ex-locking first - if the lock *does* succeed, this means it's a
         * stray config file, so it should be deleted.
         */
        if (flock(fd, LOCK_EX | LOCK_NB) != -1) {

                /* if we can't remove the file, something is wrong */
                if (unlink(path) < 0) {
                        RTE_LOG(ERR, EAL, "Could not remove %s: %s\n", path,
                                        strerror(errno));
                        close(fd);
                        return -1;
                }

                /* release the lock */
                flock(fd, LOCK_UN);
                close(fd);

                /* return success as having a stray config file is equivalent to not
                 * having a config file at all.
                 */
                return 0;
        }

        ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
                        PROT_READ, MAP_SHARED, fd, 0);

        if (ivshmem_config == MAP_FAILED) {
                close(fd);
                return -1;
        }

        /* place a shared lock on config file */
        if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
                RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
                close(fd);
                return -1;
        }

        close(fd);

        return 0;
}

/*
 * This function does the following:
 *
 * 1) Builds a table of ivshmem_segments with proper offset alignment
 * 2) Cleans up that table so that we don't have any overlapping or adjacent
 *    memory segments
 * 3) Creates memsegs from this table and maps them into memory.
 */
static inline int
map_all_segments(void)
{
        struct ivshmem_segment ms_tbl[RTE_MAX_MEMSEG];
        struct ivshmem_pci_device * pci_dev;
        struct rte_mem_config * mcfg;
        struct ivshmem_segment * seg;
        int fd, fd_zero;
        unsigned i, j;
        struct rte_memzone mz;
        struct rte_memseg ms;
        void * base_addr;
        uint64_t align, len;
        phys_addr_t ioremap_addr;

        ioremap_addr = 0;

        memset(ms_tbl, 0, sizeof(ms_tbl));
        memset(&mz, 0, sizeof(struct rte_memzone));
        memset(&ms, 0, sizeof(struct rte_memseg));

        /* first, build a table of memsegs to map, to avoid failed mmaps due to
         * overlaps
         */
        for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMSEG; i++) {
                if (i == RTE_MAX_MEMSEG) {
                        RTE_LOG(ERR, EAL, "Too many segments requested!\n");
                        return -1;
                }

                seg = &ivshmem_config->segment[i];

                /* copy segment to table */
                memcpy(&ms_tbl[i], seg, sizeof(struct ivshmem_segment));

                /* find ioremap addr */
                for (j = 0; j < DIM(ivshmem_config->pci_devs); j++) {
                        pci_dev = &ivshmem_config->pci_devs[j];
                        if (!strncmp(pci_dev->path, seg->path, sizeof(pci_dev->path))) {
                                ioremap_addr = pci_dev->ioremap_addr;
                                break;
                        }
                }
                if (ioremap_addr == 0) {
                        RTE_LOG(ERR, EAL, "Cannot find ioremap addr!\n");
                        return -1;
                }

                /* work out alignments */
                align = seg->entry.mz.addr_64 -
                                RTE_ALIGN_FLOOR(seg->entry.mz.addr_64, 0x1000);
                len = RTE_ALIGN_CEIL(seg->entry.mz.len + align, 0x1000);

                /* save original alignments */
                ms_tbl[i].align = align;

                /* create a memory zone */
                mz.addr_64 = seg->entry.mz.addr_64 - align;
                mz.len = len;
                mz.hugepage_sz = seg->entry.mz.hugepage_sz;
                mz.phys_addr = seg->entry.mz.phys_addr - align;

                /* find true physical address */
                mz.ioremap_addr = ioremap_addr + seg->entry.offset - align;

                ms_tbl[i].entry.offset = seg->entry.offset - align;

                memcpy(&ms_tbl[i].entry.mz, &mz, sizeof(struct rte_memzone));
        }

        /* clean up the segments */
        memseg_idx = cleanup_segments(ms_tbl, ivshmem_config->segment_idx);

        if (memseg_idx < 0)
                return -1;

        mcfg = rte_eal_get_configuration()->mem_config;

        fd_zero = open("/dev/zero", O_RDWR);

        if (fd_zero < 0) {
                RTE_LOG(ERR, EAL, "Cannot open /dev/zero: %s\n", strerror(errno));
                return -1;
        }

        /* create memsegs and put them into DPDK memory */
        for (i = 0; i < (unsigned) memseg_idx; i++) {

                seg = &ms_tbl[i];

                ms.addr_64 = seg->entry.mz.addr_64;
                ms.hugepage_sz = seg->entry.mz.hugepage_sz;
                ms.len = seg->entry.mz.len;
                ms.nchannel = rte_memory_get_nchannel();
                ms.nrank = rte_memory_get_nrank();
                ms.phys_addr = seg->entry.mz.phys_addr;
                ms.ioremap_addr = seg->entry.mz.ioremap_addr;
                ms.socket_id = seg->entry.mz.socket_id;

                /* grab the requested virtual address range via an anonymous
                 * /dev/zero mapping first, to make sure it is free, before
                 * replacing it with the actual device mapping */
                base_addr = mmap(ms.addr, ms.len,
                                PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_zero, 0);

                if (base_addr == MAP_FAILED || base_addr != ms.addr) {
                        RTE_LOG(ERR, EAL, "Cannot map /dev/zero!\n");
                        close(fd_zero);
                        return -1;
                }

                fd = open(seg->path, O_RDWR);

                if (fd < 0) {
                        RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", seg->path,
                                        strerror(errno));
                        close(fd_zero);
                        return -1;
                }

                munmap(ms.addr, ms.len);

                base_addr = mmap(ms.addr, ms.len,
                                PROT_READ | PROT_WRITE, MAP_SHARED, fd,
                                seg->entry.offset);

                if (base_addr == MAP_FAILED || base_addr != ms.addr) {
                        RTE_LOG(ERR, EAL, "Cannot map segment into memory: "
                                        "expected %p got %p (%s)\n", ms.addr, base_addr,
                                        strerror(errno));
                        close(fd);
                        close(fd_zero);
                        return -1;
                }

                RTE_LOG(DEBUG, EAL, "Memory segment mapped: %p (len %" PRIx64 ") at "
                                "offset 0x%" PRIx64 "\n",
                                ms.addr, ms.len, seg->entry.offset);

                /* put the pointers back into their real positions using original
                 * alignment */
                ms.addr_64 += seg->align;
                ms.phys_addr += seg->align;
                ms.ioremap_addr += seg->align;
                ms.len -= seg->align;

                /* at this point, the rest of DPDK memory is not initialized, so we
                 * expect memsegs to be empty */
                memcpy(&mcfg->memseg[i], &ms,
                                sizeof(struct rte_memseg));
                memcpy(&mcfg->free_memseg[i], &ms,
                                sizeof(struct rte_memseg));

                /* adjust the free_memseg so that there's no free space left */
                mcfg->free_memseg[i].ioremap_addr += mcfg->free_memseg[i].len;
                mcfg->free_memseg[i].phys_addr += mcfg->free_memseg[i].len;
                mcfg->free_memseg[i].addr_64 += mcfg->free_memseg[i].len;
                mcfg->free_memseg[i].len = 0;

                close(fd);

                RTE_LOG(DEBUG, EAL, "IVSHMEM segment found, size: 0x%" PRIx64 "\n",
                                ms.len);
        }

        close(fd_zero);

        return 0;
}

/* this happens at a later stage, after general EAL memory initialization */
int
rte_eal_ivshmem_obj_init(void)
{
        struct rte_ring_list* ring_list = NULL;
        struct rte_mem_config * mcfg;
        struct ivshmem_segment * seg;
        struct rte_memzone * mz;
        struct rte_ring * r;
        unsigned i, ms, idx;
        uint64_t offset;

        /* secondary process would not need any object discovery - it'll all
         * already be in shared config */
        if (rte_eal_process_type() != RTE_PROC_PRIMARY || ivshmem_config == NULL)
                return 0;

        /* check that we have an initialised ring tail queue */
        if ((ring_list =
             RTE_TAILQ_LOOKUP_BY_IDX(RTE_TAILQ_RING, rte_ring_list)) == NULL) {
                RTE_LOG(ERR, EAL, "No rte_ring tailq found!\n");
                return -1;
        }

        mcfg = rte_eal_get_configuration()->mem_config;

        /* create memzones */
        for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMZONE; i++) {

                seg = &ivshmem_config->segment[i];

                /* add memzone */
                if (mcfg->memzone_idx == RTE_MAX_MEMZONE) {
                        RTE_LOG(ERR, EAL, "No more memory zones available!\n");
                        return -1;
                }

                idx = mcfg->memzone_idx;

                RTE_LOG(DEBUG, EAL, "Found memzone: '%s' at %p (len 0x%" PRIx64 ")\n",
                                seg->entry.mz.name, seg->entry.mz.addr, seg->entry.mz.len);

                memcpy(&mcfg->memzone[idx], &seg->entry.mz,
                                sizeof(struct rte_memzone));

                /* find ioremap address */
                for (ms = 0; ms <= RTE_MAX_MEMSEG; ms++) {
                        if (ms == RTE_MAX_MEMSEG) {
                                RTE_LOG(ERR, EAL, "Physical address of segment not found!\n");
                                return -1;
                        }
                        if (CONTAINS(mcfg->memseg[ms], mcfg->memzone[idx])) {
                                offset = mcfg->memzone[idx].addr_64 -
                                                                mcfg->memseg[ms].addr_64;
                                mcfg->memzone[idx].ioremap_addr = mcfg->memseg[ms].ioremap_addr +
                                                offset;
                                break;
                        }
                }

                mcfg->memzone_idx++;
        }

        /* find rings */
        for (i = 0; i < mcfg->memzone_idx; i++) {
                mz = &mcfg->memzone[i];

                /* check if memzone has a ring prefix */
                if (strncmp(mz->name, RTE_RING_MZ_PREFIX,
                                sizeof(RTE_RING_MZ_PREFIX) - 1) != 0)
                        continue;

                r = (struct rte_ring*) (mz->addr_64);

                TAILQ_INSERT_TAIL(ring_list, r, next);

                RTE_LOG(DEBUG, EAL, "Found ring: '%s' at %p\n", r->name, mz->addr);
        }

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
        rte_memzone_dump();
        rte_ring_list_dump();
#endif

        return 0;
}

/* initialize ivshmem structures */
int rte_eal_ivshmem_init(void)
{
        struct rte_pci_device * dev;
        struct rte_pci_resource * res;
        int fd, ret;
        char path[PATH_MAX];

        /* initialize everything to 0 */
        memset(path, 0, sizeof(path));
        ivshmem_config = NULL;

        pagesz = getpagesize();

        RTE_LOG(DEBUG, EAL, "Searching for IVSHMEM devices...\n");

        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {

                if (open_shared_config() < 0) {
                        RTE_LOG(ERR, EAL, "Could not open IVSHMEM config!\n");
                        return -1;
                }
        }
        else {

                TAILQ_FOREACH(dev, &pci_device_list, next) {

                        if (is_ivshmem_device(dev)) {

                                /* IVSHMEM memory is always on BAR2 */
                                res = &dev->mem_resource[2];

                                /* if we don't have a BAR2 */
                                if (res->len == 0)
                                        continue;

                                /* construct pci device path */
                                rte_snprintf(path, sizeof(path), IVSHMEM_RESOURCE_PATH,
                                                dev->addr.domain, dev->addr.bus, dev->addr.devid,
                                                dev->addr.function);

                                /* try to find memseg */
                                fd = open(path, O_RDWR);
                                if (fd < 0) {
                                        RTE_LOG(ERR, EAL, "Could not open %s\n", path);
                                        return -1;
                                }

                                /* check if it's a DPDK IVSHMEM device */
                                ret = has_ivshmem_metadata(fd, res->len);

                                /* is DPDK device */
                                if (ret == 1) {

                                        /* config file creation is deferred until the first
                                         * DPDK device is found. then, it has to be created
                                         * only once. */
                                        if (ivshmem_config == NULL &&
                                                        create_shared_config() < 0) {
                                                RTE_LOG(ERR, EAL, "Could not create IVSHMEM config!\n");
                                                close(fd);
                                                return -1;
                                        }

                                        if (read_metadata(path, sizeof(path), fd, res->len) < 0) {
                                                RTE_LOG(ERR, EAL, "Could not read metadata from"
                                                                " device %02x:%02x.%x!\n", dev->addr.bus,
                                                                dev->addr.devid, dev->addr.function);
                                                close(fd);
                                                return -1;
                                        }

                                        if (ivshmem_config->pci_devs_idx == RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS) {
                                                RTE_LOG(WARNING, EAL,
                                                                "IVSHMEM PCI device limit exceeded. Increase "
                                                                "CONFIG_RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS in "
                                                                "your config file.\n");
                                                close(fd);
                                                break;
                                        }

                                        RTE_LOG(INFO, EAL, "Found IVSHMEM device %02x:%02x.%x\n",
                                                        dev->addr.bus, dev->addr.devid, dev->addr.function);

                                        ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].ioremap_addr = res->phys_addr;
                                        rte_snprintf(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path,
                                                        sizeof(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path),
                                                        "%s", path);

                                        ivshmem_config->pci_devs_idx++;
                                }
                                /* failed to read */
                                else if (ret < 0) {
                                        RTE_LOG(ERR, EAL, "Could not read IVSHMEM device: %s\n",
                                                        strerror(errno));
                                        close(fd);
                                        return -1;
                                }
                                /* not a DPDK device */
                                else
                                        RTE_LOG(DEBUG, EAL, "Skipping non-DPDK IVSHMEM device\n");

                                /* close the BAR fd */
                                close(fd);
                        }
                }
        }

        /* ivshmem_config is not NULL only if config was created and/or mapped */
        if (ivshmem_config) {
                if (map_all_segments() < 0) {
                        RTE_LOG(ERR, EAL, "Mapping IVSHMEM segments failed!\n");
                        return -1;
                }
        }
        else {
                RTE_LOG(DEBUG, EAL, "No IVSHMEM configuration found!\n");
        }

        return 0;
}

#endif /* RTE_LIBRTE_IVSHMEM */