remove extra parentheses in return statement
[dpdk.git] / lib / librte_eal / linuxapp / eal / eal_ivshmem.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #ifdef RTE_LIBRTE_IVSHMEM /* hide it from coverage */
35
36 #include <stdint.h>
37 #include <unistd.h>
38 #include <inttypes.h>
39 #include <sys/mman.h>
40 #include <sys/file.h>
41 #include <string.h>
42 #include <sys/queue.h>
43
44 #include <rte_log.h>
45 #include <rte_pci.h>
46 #include <rte_memory.h>
47 #include <rte_eal.h>
48 #include <rte_eal_memconfig.h>
49 #include <rte_string_fns.h>
50 #include <rte_errno.h>
51 #include <rte_ring.h>
52 #include <rte_mempool.h>
53 #include <rte_malloc.h>
54 #include <rte_common.h>
55 #include <rte_ivshmem.h>
56
57 #include "eal_internal_cfg.h"
58 #include "eal_private.h"
59
/* PCI vendor/device IDs advertised by the QEMU ivshmem device */
#define PCI_VENDOR_ID_IVSHMEM 0x1Af4
#define PCI_DEVICE_ID_IVSHMEM 0x1110

/* marker stored in device metadata identifying valid DPDK ivshmem metadata */
#define IVSHMEM_MAGIC 0x0BADC0DE

/* sysfs path of BAR2 of an ivshmem PCI device (domain:bus:devid.function) */
#define IVSHMEM_RESOURCE_PATH "/sys/bus/pci/devices/%04x:%02x:%02x.%x/resource2"
/* shared runtime config file, parameterized by the hugefile prefix */
#define IVSHMEM_CONFIG_PATH "/var/run/.%s_ivshmem_config"

/* bit flags returned by overlap()/adjacent() for each address space */
#define PHYS 0x1
#define VIRT 0x2
#define IOREMAP 0x4
#define FULL (PHYS|VIRT|IOREMAP)

/* metadata size rounded up to whole pages (pagesz is set at init time) */
#define METADATA_SIZE_ALIGNED \
	(RTE_ALIGN_CEIL(sizeof(struct rte_ivshmem_metadata),pagesz))

/* true if zone y's start address lies within region x */
#define CONTAINS(x,y)\
	(((y).addr_64 >= (x).addr_64) && ((y).addr_64 < (x).addr_64 + (x).len))

/* number of elements in a statically-sized array */
#define DIM(x) (sizeof(x)/sizeof(x[0]))
80
/* one discovered ivshmem PCI device: its sysfs resource path and the
 * physical (ioremap) address of its BAR2 */
struct ivshmem_pci_device {
	char path[PATH_MAX];
	phys_addr_t ioremap_addr;
};

/* data type to store in config */
struct ivshmem_segment {
	struct rte_ivshmem_metadata_entry entry;	/* memzone + offset from device metadata */
	uint64_t align;		/* sub-page alignment saved so addresses can be restored after mapping */
	char path[PATH_MAX];	/* sysfs resource path of the owning PCI device */
};
/* layout of the shared config file used by primary and secondary processes */
struct ivshmem_shared_config {
	struct ivshmem_segment segment[RTE_MAX_MEMSEG];
	uint32_t segment_idx;	/* number of valid entries in segment[] */
	struct ivshmem_pci_device pci_devs[RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS];
	uint32_t pci_devs_idx;	/* number of valid entries in pci_devs[] */
};
static struct ivshmem_shared_config * ivshmem_config;	/* NULL if no config mapped */
static int memseg_idx;	/* number of memsegs produced by map_all_segments() */
static int pagesz;	/* system page size, set in rte_eal_ivshmem_init() */

/* Tailq heads to add rings to */
TAILQ_HEAD(rte_ring_list, rte_tailq_entry);
104
105 /*
106  * Utility functions
107  */
108
109 static int
110 is_ivshmem_device(struct rte_pci_device * dev)
111 {
112         return dev->id.vendor_id == PCI_VENDOR_ID_IVSHMEM
113                         && dev->id.device_id == PCI_DEVICE_ID_IVSHMEM;
114 }
115
116 static void *
117 map_metadata(int fd, uint64_t len)
118 {
119         size_t metadata_len = sizeof(struct rte_ivshmem_metadata);
120         size_t aligned_len = METADATA_SIZE_ALIGNED;
121
122         return mmap(NULL, metadata_len, PROT_READ | PROT_WRITE,
123                         MAP_SHARED, fd, len - aligned_len);
124 }
125
/* Unmap a metadata mapping previously created by map_metadata().
 * Only sizeof(metadata) bytes were mapped, so that is what is unmapped. */
static void
unmap_metadata(void * ptr)
{
	munmap(ptr, sizeof(struct rte_ivshmem_metadata));
}
131
132 static int
133 has_ivshmem_metadata(int fd, uint64_t len)
134 {
135         struct rte_ivshmem_metadata metadata;
136         void * ptr;
137
138         ptr = map_metadata(fd, len);
139
140         if (ptr == MAP_FAILED)
141                 return -1;
142
143         metadata = *(struct rte_ivshmem_metadata*) (ptr);
144
145         unmap_metadata(ptr);
146
147         return metadata.magic_number == IVSHMEM_MAGIC;
148 }
149
150 static void
151 remove_segment(struct ivshmem_segment * ms, int len, int idx)
152 {
153         int i;
154
155         for (i = idx; i < len - 1; i++)
156                 memcpy(&ms[i], &ms[i+1], sizeof(struct ivshmem_segment));
157         memset(&ms[len-1], 0, sizeof(struct ivshmem_segment));
158 }
159
160 static int
161 overlap(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
162 {
163         uint64_t start1, end1, start2, end2;
164         uint64_t p_start1, p_end1, p_start2, p_end2;
165         uint64_t i_start1, i_end1, i_start2, i_end2;
166         int result = 0;
167
168         /* gather virtual addresses */
169         start1 = mz1->addr_64;
170         end1 = mz1->addr_64 + mz1->len;
171         start2 = mz2->addr_64;
172         end2 = mz2->addr_64 + mz2->len;
173
174         /* gather physical addresses */
175         p_start1 = mz1->phys_addr;
176         p_end1 = mz1->phys_addr + mz1->len;
177         p_start2 = mz2->phys_addr;
178         p_end2 = mz2->phys_addr + mz2->len;
179
180         /* gather ioremap addresses */
181         i_start1 = mz1->ioremap_addr;
182         i_end1 = mz1->ioremap_addr + mz1->len;
183         i_start2 = mz2->ioremap_addr;
184         i_end2 = mz2->ioremap_addr + mz2->len;
185
186         /* check for overlap in virtual addresses */
187         if (start1 >= start2 && start1 < end2)
188                 result |= VIRT;
189         if (start2 >= start1 && start2 < end1)
190                 result |= VIRT;
191
192         /* check for overlap in physical addresses */
193         if (p_start1 >= p_start2 && p_start1 < p_end2)
194                 result |= PHYS;
195         if (p_start2 >= p_start1 && p_start2 < p_end1)
196                 result |= PHYS;
197
198         /* check for overlap in ioremap addresses */
199         if (i_start1 >= i_start2 && i_start1 < i_end2)
200                 result |= IOREMAP;
201         if (i_start2 >= i_start1 && i_start2 < i_end1)
202                 result |= IOREMAP;
203
204         return result;
205 }
206
207 static int
208 adjacent(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
209 {
210         uint64_t start1, end1, start2, end2;
211         uint64_t p_start1, p_end1, p_start2, p_end2;
212         uint64_t i_start1, i_end1, i_start2, i_end2;
213         int result = 0;
214
215         /* gather virtual addresses */
216         start1 = mz1->addr_64;
217         end1 = mz1->addr_64 + mz1->len;
218         start2 = mz2->addr_64;
219         end2 = mz2->addr_64 + mz2->len;
220
221         /* gather physical addresses */
222         p_start1 = mz1->phys_addr;
223         p_end1 = mz1->phys_addr + mz1->len;
224         p_start2 = mz2->phys_addr;
225         p_end2 = mz2->phys_addr + mz2->len;
226
227         /* gather ioremap addresses */
228         i_start1 = mz1->ioremap_addr;
229         i_end1 = mz1->ioremap_addr + mz1->len;
230         i_start2 = mz2->ioremap_addr;
231         i_end2 = mz2->ioremap_addr + mz2->len;
232
233         /* check if segments are virtually adjacent */
234         if (start1 == end2)
235                 result |= VIRT;
236         if (start2 == end1)
237                 result |= VIRT;
238
239         /* check if segments are physically adjacent */
240         if (p_start1 == p_end2)
241                 result |= PHYS;
242         if (p_start2 == p_end1)
243                 result |= PHYS;
244
245         /* check if segments are ioremap-adjacent */
246         if (i_start1 == i_end2)
247                 result |= IOREMAP;
248         if (i_start2 == i_end1)
249                 result |= IOREMAP;
250
251         return result;
252 }
253
254 static int
255 has_adjacent_segments(struct ivshmem_segment * ms, int len)
256 {
257         int i, j, a;
258
259         for (i = 0; i < len; i++)
260                 for (j = i + 1; j < len; j++) {
261                         a = adjacent(&ms[i].entry.mz, &ms[j].entry.mz);
262
263                         /* check if segments are adjacent virtually and/or physically but
264                          * not ioremap (since that would indicate that they are from
265                          * different PCI devices and thus don't need to be concatenated.
266                          */
267                         if ((a & (VIRT|PHYS)) > 0 && (a & IOREMAP) == 0)
268                                 return 1;
269                 }
270         return 0;
271 }
272
273 static int
274 has_overlapping_segments(struct ivshmem_segment * ms, int len)
275 {
276         int i, j;
277
278         for (i = 0; i < len; i++)
279                 for (j = i + 1; j < len; j++)
280                         if (overlap(&ms[i].entry.mz, &ms[j].entry.mz))
281                                 return 1;
282         return 0;
283 }
284
285 static int
286 seg_compare(const void * a, const void * b)
287 {
288         const struct ivshmem_segment * s1 = (const struct ivshmem_segment*) a;
289         const struct ivshmem_segment * s2 = (const struct ivshmem_segment*) b;
290
291         /* move unallocated zones to the end */
292         if (s1->entry.mz.addr == NULL && s2->entry.mz.addr == NULL)
293                 return 0;
294         if (s1->entry.mz.addr == 0)
295                 return 1;
296         if (s2->entry.mz.addr == 0)
297                 return -1;
298
299         return s1->entry.mz.phys_addr > s2->entry.mz.phys_addr;
300 }
301
302 #ifdef RTE_LIBRTE_IVSHMEM_DEBUG
/* Log one metadata entry's virtual, physical and ioremap ranges plus its
 * length and offset (compiled in debug builds only). */
static void
entry_dump(struct rte_ivshmem_metadata_entry *e)
{
	RTE_LOG(DEBUG, EAL, "\tvirt: %p-%p\n", e->mz.addr,
			RTE_PTR_ADD(e->mz.addr, e->mz.len));
	RTE_LOG(DEBUG, EAL, "\tphys: 0x%" PRIx64 "-0x%" PRIx64 "\n",
			e->mz.phys_addr,
			e->mz.phys_addr + e->mz.len);
	RTE_LOG(DEBUG, EAL, "\tio: 0x%" PRIx64 "-0x%" PRIx64 "\n",
			e->mz.ioremap_addr,
			e->mz.ioremap_addr + e->mz.len);
	RTE_LOG(DEBUG, EAL, "\tlen: 0x%" PRIx64 "\n", e->mz.len);
	RTE_LOG(DEBUG, EAL, "\toff: 0x%" PRIx64 "\n", e->offset);
}
317 #endif
318
319
320
321 /*
322  * Actual useful code
323  */
324
/* read through metadata mapped from the IVSHMEM device */
/*
 * Copy metadata entries from an IVSHMEM device (fd, total length flen) into
 * the global ivshmem_config segment table, tagging each entry with the
 * device's sysfs path. Returns 0 on success, -1 on map failure or when the
 * segment table is full.
 *
 * NOTE(review): path_len is used as the snprintf() bound when writing into
 * segment[idx].path (a PATH_MAX buffer) — assumes callers never pass a
 * length larger than PATH_MAX; confirm at call sites.
 */
static int
read_metadata(char * path, int path_len, int fd, uint64_t flen)
{
	struct rte_ivshmem_metadata metadata;
	struct rte_ivshmem_metadata_entry * entry;
	int idx, i;
	void * ptr;

	ptr = map_metadata(fd, flen);

	if (ptr == MAP_FAILED)
		return -1;

	/* take a local copy so the mapping can be torn down immediately */
	metadata = *(struct rte_ivshmem_metadata*) (ptr);

	unmap_metadata(ptr);

	RTE_LOG(DEBUG, EAL, "Parsing metadata for \"%s\"\n", metadata.name);

	/* append after any segments recorded by previously-scanned devices */
	idx = ivshmem_config->segment_idx;

	for (i = 0; i < RTE_LIBRTE_IVSHMEM_MAX_ENTRIES &&
		idx <= RTE_MAX_MEMSEG; i++) {

		/* idx may legitimately reach RTE_MAX_MEMSEG; that means the
		 * table is full and one more entry cannot be stored */
		if (idx == RTE_MAX_MEMSEG) {
			RTE_LOG(ERR, EAL, "Not enough memory segments!\n");
			return -1;
		}

		entry = &metadata.entry[i];

		/* stop on uninitialized memzone */
		if (entry->mz.len == 0)
			break;

		/* copy metadata entry */
		memcpy(&ivshmem_config->segment[idx].entry, entry,
				sizeof(struct rte_ivshmem_metadata_entry));

		/* copy path */
		snprintf(ivshmem_config->segment[idx].path, path_len, "%s", path);

		idx++;
	}
	/* publish the new table size */
	ivshmem_config->segment_idx = idx;

	return 0;
}
374
/* check through each segment and look for adjacent or overlapping ones. */
/*
 * Merge fully-overlapping or fully-adjacent segments in ms (tbl_len entries)
 * until none remain. Partially overlapping segments are an error. Returns
 * the reduced number of segments, or -1 on error.
 */
static int
cleanup_segments(struct ivshmem_segment * ms, int tbl_len)
{
	struct ivshmem_segment * s, * tmp;
	int i, j, concat, seg_adjacent, seg_overlapping;
	uint64_t start1, start2, end1, end2, p_start1, p_start2, i_start1, i_start2;

	/* sort by physical address so mergeable neighbors end up next to each
	 * other; unallocated zones are pushed to the end (see seg_compare) */
	qsort(ms, tbl_len, sizeof(struct ivshmem_segment),
				seg_compare);

	/* each merge invalidates the indices, so after a merge the scan is
	 * restarted from scratch until no mergeable pair remains */
	while (has_overlapping_segments(ms, tbl_len) ||
			has_adjacent_segments(ms, tbl_len)) {

		for (i = 0; i < tbl_len; i++) {
			s = &ms[i];

			concat = 0;

			for (j = i + 1; j < tbl_len; j++) {
				tmp = &ms[j];

				/* check if this segment is overlapping with existing segment,
				 * or is adjacent to existing segment */
				seg_overlapping = overlap(&s->entry.mz, &tmp->entry.mz);
				seg_adjacent = adjacent(&s->entry.mz, &tmp->entry.mz);

				/* check if segments fully overlap or are fully adjacent */
				if ((seg_adjacent == FULL) || (seg_overlapping == FULL)) {

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
					RTE_LOG(DEBUG, EAL, "Concatenating segments\n");
					RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
					entry_dump(&s->entry);
					RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
					entry_dump(&tmp->entry);
#endif

					start1 = s->entry.mz.addr_64;
					start2 = tmp->entry.mz.addr_64;
					p_start1 = s->entry.mz.phys_addr;
					p_start2 = tmp->entry.mz.phys_addr;
					i_start1 = s->entry.mz.ioremap_addr;
					i_start2 = tmp->entry.mz.ioremap_addr;
					end1 = s->entry.mz.addr_64 + s->entry.mz.len;
					end2 = tmp->entry.mz.addr_64 + tmp->entry.mz.len;

					/* settle for minimum start address and maximum length */
					s->entry.mz.addr_64 = RTE_MIN(start1, start2);
					s->entry.mz.phys_addr = RTE_MIN(p_start1, p_start2);
					s->entry.mz.ioremap_addr = RTE_MIN(i_start1, i_start2);
					s->entry.offset = RTE_MIN(s->entry.offset, tmp->entry.offset);
					s->entry.mz.len = RTE_MAX(end1, end2) - s->entry.mz.addr_64;
					concat = 1;

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
					RTE_LOG(DEBUG, EAL, "Resulting segment:\n");
					entry_dump(&s->entry);

#endif
				}
				/* if segments not fully overlap, we have an error condition.
				 * adjacent segments can coexist.
				 */
				else if (seg_overlapping > 0) {
					RTE_LOG(ERR, EAL, "Segments %i and %i overlap!\n", i, j);
#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
					RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
					entry_dump(&s->entry);
					RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
					entry_dump(&tmp->entry);
#endif
					return -1;
				}
				if (concat)
					break;
			}
			/* if we concatenated, remove segment at j */
			if (concat) {
				remove_segment(ms, tbl_len, j);
				tbl_len--;
				break;
			}
		}
	}

	/* number of segments remaining after all merges */
	return tbl_len;
}
463
464 static int
465 create_shared_config(void)
466 {
467         char path[PATH_MAX];
468         int fd;
469
470         /* build ivshmem config file path */
471         snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
472                         internal_config.hugefile_prefix);
473
474         fd = open(path, O_CREAT | O_RDWR, 0600);
475
476         if (fd < 0) {
477                 RTE_LOG(ERR, EAL, "Could not open %s: %s\n", path, strerror(errno));
478                 return -1;
479         }
480
481         /* try ex-locking first - if the file is locked, we have a problem */
482         if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
483                 RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
484                 close(fd);
485                 return -1;
486         }
487
488         if (ftruncate(fd, sizeof(struct ivshmem_shared_config)) < 0) {
489                 RTE_LOG(ERR, EAL, "ftruncate failed: %s\n", strerror(errno));
490                 return -1;
491         }
492
493         ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
494                         PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
495
496         if (ivshmem_config == MAP_FAILED)
497                 return -1;
498
499         memset(ivshmem_config, 0, sizeof(struct ivshmem_shared_config));
500
501         /* change the exclusive lock we got earlier to a shared lock */
502         if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
503                 RTE_LOG(ERR, EAL, "Locking %s failed: %s \n", path, strerror(errno));
504                 return -1;
505         }
506
507         close(fd);
508
509         return 0;
510 }
511
512 /* open shared config file and, if present, map the config.
513  * having no config file is not an error condition, as we later check if
514  * ivshmem_config is NULL (if it is, that means nothing was mapped). */
515 static int
516 open_shared_config(void)
517 {
518         char path[PATH_MAX];
519         int fd;
520
521         /* build ivshmem config file path */
522         snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
523                         internal_config.hugefile_prefix);
524
525         fd = open(path, O_RDONLY);
526
527         /* if the file doesn't exist, just return success */
528         if (fd < 0 && errno == ENOENT)
529                 return 0;
530         /* else we have an error condition */
531         else if (fd < 0) {
532                 RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
533                                 path, strerror(errno));
534                 return -1;
535         }
536
537         /* try ex-locking first - if the lock *does* succeed, this means it's a
538          * stray config file, so it should be deleted.
539          */
540         if (flock(fd, LOCK_EX | LOCK_NB) != -1) {
541
542                 /* if we can't remove the file, something is wrong */
543                 if (unlink(path) < 0) {
544                         RTE_LOG(ERR, EAL, "Could not remove %s: %s\n", path,
545                                         strerror(errno));
546                         return -1;
547                 }
548
549                 /* release the lock */
550                 flock(fd, LOCK_UN);
551                 close(fd);
552
553                 /* return success as having a stray config file is equivalent to not
554                  * having config file at all.
555                  */
556                 return 0;
557         }
558
559         ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
560                         PROT_READ, MAP_SHARED, fd, 0);
561
562         if (ivshmem_config == MAP_FAILED)
563                 return -1;
564
565         /* place a shared lock on config file */
566         if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
567                 RTE_LOG(ERR, EAL, "Locking %s failed: %s \n", path, strerror(errno));
568                 return -1;
569         }
570
571         close(fd);
572
573         return 0;
574 }
575
576 /*
577  * This function does the following:
578  *
579  * 1) Builds a table of ivshmem_segments with proper offset alignment
580  * 2) Cleans up that table so that we don't have any overlapping or adjacent
581  *    memory segments
582  * 3) Creates memsegs from this table and maps them into memory.
583  */
584 static inline int
585 map_all_segments(void)
586 {
587         struct ivshmem_segment ms_tbl[RTE_MAX_MEMSEG];
588         struct ivshmem_pci_device * pci_dev;
589         struct rte_mem_config * mcfg;
590         struct ivshmem_segment * seg;
591         int fd, fd_zero;
592         unsigned i, j;
593         struct rte_memzone mz;
594         struct rte_memseg ms;
595         void * base_addr;
596         uint64_t align, len;
597         phys_addr_t ioremap_addr;
598
599         ioremap_addr = 0;
600
601         memset(ms_tbl, 0, sizeof(ms_tbl));
602         memset(&mz, 0, sizeof(struct rte_memzone));
603         memset(&ms, 0, sizeof(struct rte_memseg));
604
605         /* first, build a table of memsegs to map, to avoid failed mmaps due to
606          * overlaps
607          */
608         for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMSEG; i++) {
609                 if (i == RTE_MAX_MEMSEG) {
610                         RTE_LOG(ERR, EAL, "Too many segments requested!\n");
611                         return -1;
612                 }
613
614                 seg = &ivshmem_config->segment[i];
615
616                 /* copy segment to table */
617                 memcpy(&ms_tbl[i], seg, sizeof(struct ivshmem_segment));
618
619                 /* find ioremap addr */
620                 for (j = 0; j < DIM(ivshmem_config->pci_devs); j++) {
621                         pci_dev = &ivshmem_config->pci_devs[j];
622                         if (!strncmp(pci_dev->path, seg->path, sizeof(pci_dev->path))) {
623                                 ioremap_addr = pci_dev->ioremap_addr;
624                                 break;
625                         }
626                 }
627                 if (ioremap_addr == 0) {
628                         RTE_LOG(ERR, EAL, "Cannot find ioremap addr!\n");
629                         return -1;
630                 }
631
632                 /* work out alignments */
633                 align = seg->entry.mz.addr_64 -
634                                 RTE_ALIGN_FLOOR(seg->entry.mz.addr_64, 0x1000);
635                 len = RTE_ALIGN_CEIL(seg->entry.mz.len + align, 0x1000);
636
637                 /* save original alignments */
638                 ms_tbl[i].align = align;
639
640                 /* create a memory zone */
641                 mz.addr_64 = seg->entry.mz.addr_64 - align;
642                 mz.len = len;
643                 mz.hugepage_sz = seg->entry.mz.hugepage_sz;
644                 mz.phys_addr = seg->entry.mz.phys_addr - align;
645
646                 /* find true physical address */
647                 mz.ioremap_addr = ioremap_addr + seg->entry.offset - align;
648
649                 ms_tbl[i].entry.offset = seg->entry.offset - align;
650
651                 memcpy(&ms_tbl[i].entry.mz, &mz, sizeof(struct rte_memzone));
652         }
653
654         /* clean up the segments */
655         memseg_idx = cleanup_segments(ms_tbl, ivshmem_config->segment_idx);
656
657         if (memseg_idx < 0)
658                 return -1;
659
660         mcfg = rte_eal_get_configuration()->mem_config;
661
662         fd_zero = open("/dev/zero", O_RDWR);
663
664         if (fd_zero < 0) {
665                 RTE_LOG(ERR, EAL, "Cannot open /dev/zero: %s\n", strerror(errno));
666                 return -1;
667         }
668
669         /* create memsegs and put them into DPDK memory */
670         for (i = 0; i < (unsigned) memseg_idx; i++) {
671
672                 seg = &ms_tbl[i];
673
674                 ms.addr_64 = seg->entry.mz.addr_64;
675                 ms.hugepage_sz = seg->entry.mz.hugepage_sz;
676                 ms.len = seg->entry.mz.len;
677                 ms.nchannel = rte_memory_get_nchannel();
678                 ms.nrank = rte_memory_get_nrank();
679                 ms.phys_addr = seg->entry.mz.phys_addr;
680                 ms.ioremap_addr = seg->entry.mz.ioremap_addr;
681                 ms.socket_id = seg->entry.mz.socket_id;
682
683                 base_addr = mmap(ms.addr, ms.len,
684                                 PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_zero, 0);
685
686                 if (base_addr == MAP_FAILED || base_addr != ms.addr) {
687                         RTE_LOG(ERR, EAL, "Cannot map /dev/zero!\n");
688                         return -1;
689                 }
690
691                 fd = open(seg->path, O_RDWR);
692
693                 if (fd < 0) {
694                         RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", seg->path,
695                                         strerror(errno));
696                         return -1;
697                 }
698
699                 munmap(ms.addr, ms.len);
700
701                 base_addr = mmap(ms.addr, ms.len,
702                                 PROT_READ | PROT_WRITE, MAP_SHARED, fd,
703                                 seg->entry.offset);
704
705
706                 if (base_addr == MAP_FAILED || base_addr != ms.addr) {
707                         RTE_LOG(ERR, EAL, "Cannot map segment into memory: "
708                                         "expected %p got %p (%s)\n", ms.addr, base_addr,
709                                         strerror(errno));
710                         return -1;
711                 }
712
713                 RTE_LOG(DEBUG, EAL, "Memory segment mapped: %p (len %" PRIx64 ") at "
714                                 "offset 0x%" PRIx64 "\n",
715                                 ms.addr, ms.len, seg->entry.offset);
716
717                 /* put the pointers back into their real positions using original
718                  * alignment */
719                 ms.addr_64 += seg->align;
720                 ms.phys_addr += seg->align;
721                 ms.ioremap_addr += seg->align;
722                 ms.len -= seg->align;
723
724                 /* at this point, the rest of DPDK memory is not initialized, so we
725                  * expect memsegs to be empty */
726                 memcpy(&mcfg->memseg[i], &ms,
727                                 sizeof(struct rte_memseg));
728
729                 close(fd);
730
731                 RTE_LOG(DEBUG, EAL, "IVSHMEM segment found, size: 0x%lx\n",
732                                 ms.len);
733         }
734
735         return 0;
736 }
737
738 /* this happens at a later stage, after general EAL memory initialization */
739 int
740 rte_eal_ivshmem_obj_init(void)
741 {
742         struct rte_ring_list* ring_list = NULL;
743         struct rte_mem_config * mcfg;
744         struct ivshmem_segment * seg;
745         struct rte_memzone * mz;
746         struct rte_ring * r;
747         struct rte_tailq_entry *te;
748         unsigned i, ms, idx;
749         uint64_t offset;
750
751         /* secondary process would not need any object discovery - it'll all
752          * already be in shared config */
753         if (rte_eal_process_type() != RTE_PROC_PRIMARY || ivshmem_config == NULL)
754                 return 0;
755
756         /* check that we have an initialised ring tail queue */
757         ring_list = RTE_TAILQ_LOOKUP(RTE_TAILQ_RING_NAME, rte_ring_list);
758         if (ring_list == NULL) {
759                 RTE_LOG(ERR, EAL, "No rte_ring tailq found!\n");
760                 return -1;
761         }
762
763         mcfg = rte_eal_get_configuration()->mem_config;
764
765         /* create memzones */
766         for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMZONE; i++) {
767
768                 seg = &ivshmem_config->segment[i];
769
770                 /* add memzone */
771                 if (mcfg->memzone_cnt == RTE_MAX_MEMZONE) {
772                         RTE_LOG(ERR, EAL, "No more memory zones available!\n");
773                         return -1;
774                 }
775
776                 idx = mcfg->memzone_cnt;
777
778                 RTE_LOG(DEBUG, EAL, "Found memzone: '%s' at %p (len 0x%" PRIx64 ")\n",
779                                 seg->entry.mz.name, seg->entry.mz.addr, seg->entry.mz.len);
780
781                 memcpy(&mcfg->memzone[idx], &seg->entry.mz,
782                                 sizeof(struct rte_memzone));
783
784                 /* find ioremap address */
785                 for (ms = 0; ms <= RTE_MAX_MEMSEG; ms++) {
786                         if (ms == RTE_MAX_MEMSEG) {
787                                 RTE_LOG(ERR, EAL, "Physical address of segment not found!\n");
788                                 return -1;
789                         }
790                         if (CONTAINS(mcfg->memseg[ms], mcfg->memzone[idx])) {
791                                 offset = mcfg->memzone[idx].addr_64 -
792                                                                 mcfg->memseg[ms].addr_64;
793                                 mcfg->memzone[idx].ioremap_addr = mcfg->memseg[ms].ioremap_addr +
794                                                 offset;
795                                 break;
796                         }
797                 }
798
799                 mcfg->memzone_cnt++;
800         }
801
802         rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
803
804         /* find rings */
805         for (i = 0; i < mcfg->memzone_cnt; i++) {
806                 mz = &mcfg->memzone[i];
807
808                 /* check if memzone has a ring prefix */
809                 if (strncmp(mz->name, RTE_RING_MZ_PREFIX,
810                                 sizeof(RTE_RING_MZ_PREFIX) - 1) != 0)
811                         continue;
812
813                 r = (struct rte_ring*) (mz->addr_64);
814
815                 te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0);
816                 if (te == NULL) {
817                         RTE_LOG(ERR, EAL, "Cannot allocate ring tailq entry!\n");
818                         return -1;
819                 }
820
821                 te->data = (void *) r;
822
823                 TAILQ_INSERT_TAIL(ring_list, te, next);
824
825                 RTE_LOG(DEBUG, EAL, "Found ring: '%s' at %p\n", r->name, mz->addr);
826         }
827         rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
828
829 #ifdef RTE_LIBRTE_IVSHMEM_DEBUG
830         rte_memzone_dump(stdout);
831         rte_ring_list_dump(stdout);
832 #endif
833
834         return 0;
835 }
836
837 /* initialize ivshmem structures */
838 int rte_eal_ivshmem_init(void)
839 {
840         struct rte_pci_device * dev;
841         struct rte_pci_resource * res;
842         int fd, ret;
843         char path[PATH_MAX];
844
845         /* initialize everything to 0 */
846         memset(path, 0, sizeof(path));
847         ivshmem_config = NULL;
848
849         pagesz = getpagesize();
850
851         RTE_LOG(DEBUG, EAL, "Searching for IVSHMEM devices...\n");
852
853         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
854
855                 if (open_shared_config() < 0) {
856                         RTE_LOG(ERR, EAL, "Could not open IVSHMEM config!\n");
857                         return -1;
858                 }
859         }
860         else {
861
862                 TAILQ_FOREACH(dev, &pci_device_list, next) {
863
864                         if (is_ivshmem_device(dev)) {
865
866                                 /* IVSHMEM memory is always on BAR2 */
867                                 res = &dev->mem_resource[2];
868
869                                 /* if we don't have a BAR2 */
870                                 if (res->len == 0)
871                                         continue;
872
873                                 /* construct pci device path */
874                                 snprintf(path, sizeof(path), IVSHMEM_RESOURCE_PATH,
875                                                 dev->addr.domain, dev->addr.bus, dev->addr.devid,
876                                                 dev->addr.function);
877
878                                 /* try to find memseg */
879                                 fd = open(path, O_RDWR);
880                                 if (fd < 0) {
881                                         RTE_LOG(ERR, EAL, "Could not open %s\n", path);
882                                         return -1;
883                                 }
884
885                                 /* check if it's a DPDK IVSHMEM device */
886                                 ret = has_ivshmem_metadata(fd, res->len);
887
888                                 /* is DPDK device */
889                                 if (ret == 1) {
890
891                                         /* config file creation is deferred until the first
892                                          * DPDK device is found. then, it has to be created
893                                          * only once. */
894                                         if (ivshmem_config == NULL &&
895                                                         create_shared_config() < 0) {
896                                                 RTE_LOG(ERR, EAL, "Could not create IVSHMEM config!\n");
897                                                 close(fd);
898                                                 return -1;
899                                         }
900
901                                         if (read_metadata(path, sizeof(path), fd, res->len) < 0) {
902                                                 RTE_LOG(ERR, EAL, "Could not read metadata from"
903                                                                 " device %02x:%02x.%x!\n", dev->addr.bus,
904                                                                 dev->addr.devid, dev->addr.function);
905                                                 close(fd);
906                                                 return -1;
907                                         }
908
909                                         if (ivshmem_config->pci_devs_idx == RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS) {
910                                                 RTE_LOG(WARNING, EAL,
911                                                                 "IVSHMEM PCI device limit exceeded. Increase "
912                                                                 "CONFIG_RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS  in "
913                                                                 "your config file.\n");
914                                                 break;
915                                         }
916
917                                         RTE_LOG(INFO, EAL, "Found IVSHMEM device %02x:%02x.%x\n",
918                                                         dev->addr.bus, dev->addr.devid, dev->addr.function);
919
920                                         ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].ioremap_addr = res->phys_addr;
921                                         snprintf(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path,
922                                                         sizeof(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path),
923                                                         "%s", path);
924
925                                         ivshmem_config->pci_devs_idx++;
926                                 }
927                                 /* failed to read */
928                                 else if (ret < 0) {
929                                         RTE_LOG(ERR, EAL, "Could not read IVSHMEM device: %s\n",
930                                                         strerror(errno));
931                                         close(fd);
932                                         return -1;
933                                 }
934                                 /* not a DPDK device */
935                                 else
936                                         RTE_LOG(DEBUG, EAL, "Skipping non-DPDK IVSHMEM device\n");
937
938                                 /* close the BAR fd */
939                                 close(fd);
940                         }
941                 }
942         }
943
944         /* ivshmem_config is not NULL only if config was created and/or mapped */
945         if (ivshmem_config) {
946                 if (map_all_segments() < 0) {
947                         RTE_LOG(ERR, EAL, "Mapping IVSHMEM segments failed!\n");
948                         return -1;
949                 }
950         }
951         else {
952                 RTE_LOG(DEBUG, EAL, "No IVSHMEM configuration found! \n");
953         }
954
955         return 0;
956 }
957
958 #endif