net/vdev_netvsc: fix routed devices probing
[dpdk.git] / drivers / net / vdev_netvsc / vdev_netvsc.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2017 6WIND S.A.
3  * Copyright 2017 Mellanox Technologies, Ltd.
4  */
5
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <inttypes.h>
9 #include <linux/sockios.h>
10 #include <linux/netlink.h>
11 #include <linux/rtnetlink.h>
12 #include <net/if.h>
13 #include <net/if_arp.h>
14 #include <netinet/ip.h>
15 #include <stdarg.h>
16 #include <stddef.h>
17 #include <stdlib.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <sys/ioctl.h>
22 #include <sys/queue.h>
23 #include <sys/socket.h>
24 #include <unistd.h>
25
26 #include <rte_alarm.h>
27 #include <rte_bus.h>
28 #include <rte_bus_vdev.h>
29 #include <rte_common.h>
30 #include <rte_config.h>
31 #include <rte_dev.h>
32 #include <rte_errno.h>
33 #include <rte_ethdev.h>
34 #include <rte_ether.h>
35 #include <rte_hypervisor.h>
36 #include <rte_kvargs.h>
37 #include <rte_log.h>
38
/* Driver identifier and its string form. */
#define VDEV_NETVSC_DRIVER net_vdev_netvsc
#define VDEV_NETVSC_DRIVER_NAME RTE_STR(VDEV_NETVSC_DRIVER)

/* Device argument keys recognized by this driver. */
#define VDEV_NETVSC_ARG_IFACE "iface"
#define VDEV_NETVSC_ARG_MAC "mac"
#define VDEV_NETVSC_ARG_FORCE "force"
#define VDEV_NETVSC_ARG_IGNORE "ignore"

/* Probing alarm period; multiplied by 1000 when the alarm is armed. */
#define VDEV_NETVSC_PROBE_MS 1000

/* Expected contents of the sysfs "device/class_id" entry of a NetVSC. */
#define NETVSC_CLASS_ID "{f8615163-df3e-46c5-913f-f2d2f965ed0e}"
/* NOTE(review): not referenced by the code visible in this file. */
#define NETVSC_MAX_ROUTE_LINE_SIZE 300
49
/** Driver-specific log messages type. */
static int vdev_netvsc_logtype;

/*
 * Log a message through rte_log() using the driver log type. The driver
 * name and ": " are prepended to the format string and a newline is
 * appended to it.
 */
#define DRV_LOG(level, ...) \
        rte_log(RTE_LOG_ ## level, \
                vdev_netvsc_logtype, \
                RTE_FMT(VDEV_NETVSC_DRIVER_NAME ": " \
                        RTE_FMT_HEAD(__VA_ARGS__,) "\n", \
                RTE_FMT_TAIL(__VA_ARGS__,)))
59
60 /** Context structure for a vdev_netvsc instance. */
61 struct vdev_netvsc_ctx {
62         LIST_ENTRY(vdev_netvsc_ctx) entry; /**< Next entry in list. */
63         unsigned int id;                   /**< Unique ID. */
64         char name[64];                     /**< Unique name. */
65         char devname[64];                  /**< Fail-safe instance name. */
66         char devargs[256];                 /**< Fail-safe device arguments. */
67         char if_name[IF_NAMESIZE];         /**< NetVSC netdevice name. */
68         unsigned int if_index;             /**< NetVSC netdevice index. */
69         struct ether_addr if_addr;         /**< NetVSC MAC address. */
70         int pipe[2];                       /**< Fail-safe communication pipe. */
71         char yield[256];                   /**< PCI sub-device arguments. */
72 };
73
74 /** Context list is common to all driver instances. */
75 static LIST_HEAD(, vdev_netvsc_ctx) vdev_netvsc_ctx_list =
76         LIST_HEAD_INITIALIZER(vdev_netvsc_ctx_list);
77
78 /** Number of entries in context list. */
79 static unsigned int vdev_netvsc_ctx_count;
80
81 /** Number of driver instances relying on context list. */
82 static unsigned int vdev_netvsc_ctx_inst;
83
84 /**
85  * Destroy a vdev_netvsc context instance.
86  *
87  * @param ctx
88  *   Context to destroy.
89  */
90 static void
91 vdev_netvsc_ctx_destroy(struct vdev_netvsc_ctx *ctx)
92 {
93         if (ctx->pipe[0] != -1)
94                 close(ctx->pipe[0]);
95         if (ctx->pipe[1] != -1)
96                 close(ctx->pipe[1]);
97         free(ctx);
98 }
99
100 /**
101  * Iterate over system network interfaces.
102  *
103  * This function runs a given callback function for each netdevice found on
104  * the system.
105  *
106  * @param func
107  *   Callback function pointer. List traversal is aborted when this function
108  *   returns a nonzero value.
109  * @param ...
110  *   Variable parameter list passed as @p va_list to @p func.
111  *
112  * @return
113  *   0 when the entire list is traversed successfully, a negative error code
114  *   in case or failure, or the nonzero value returned by @p func when list
115  *   traversal is aborted.
116  */
117 static int
118 vdev_netvsc_foreach_iface(int (*func)(const struct if_nameindex *iface,
119                                       const struct ether_addr *eth_addr,
120                                       va_list ap), ...)
121 {
122         struct if_nameindex *iface = if_nameindex();
123         int s = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
124         unsigned int i;
125         int ret = 0;
126
127         if (!iface) {
128                 ret = -ENOBUFS;
129                 DRV_LOG(ERR, "cannot retrieve system network interfaces");
130                 goto error;
131         }
132         if (s == -1) {
133                 ret = -errno;
134                 DRV_LOG(ERR, "cannot open socket: %s", rte_strerror(errno));
135                 goto error;
136         }
137         for (i = 0; iface[i].if_name; ++i) {
138                 struct ifreq req;
139                 struct ether_addr eth_addr;
140                 va_list ap;
141
142                 strncpy(req.ifr_name, iface[i].if_name, sizeof(req.ifr_name));
143                 if (ioctl(s, SIOCGIFHWADDR, &req) == -1) {
144                         DRV_LOG(WARNING, "cannot retrieve information about"
145                                          " interface \"%s\": %s",
146                                          req.ifr_name, rte_strerror(errno));
147                         continue;
148                 }
149                 if (req.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
150                         DRV_LOG(DEBUG, "interface %s is non-ethernet device",
151                                 req.ifr_name);
152                         continue;
153                 }
154                 memcpy(eth_addr.addr_bytes, req.ifr_hwaddr.sa_data,
155                        RTE_DIM(eth_addr.addr_bytes));
156                 va_start(ap, func);
157                 ret = func(&iface[i], &eth_addr, ap);
158                 va_end(ap);
159                 if (ret)
160                         break;
161         }
162 error:
163         if (s != -1)
164                 close(s);
165         if (iface)
166                 if_freenameindex(iface);
167         return ret;
168 }
169
170 /**
171  * Determine if a network interface is NetVSC.
172  *
173  * @param[in] iface
174  *   Pointer to netdevice description structure (name and index).
175  *
176  * @return
177  *   A nonzero value when interface is detected as NetVSC. In case of error,
178  *   rte_errno is updated and 0 returned.
179  */
180 static int
181 vdev_netvsc_iface_is_netvsc(const struct if_nameindex *iface)
182 {
183         static const char temp[] = "/sys/class/net/%s/device/class_id";
184         char path[sizeof(temp) + IF_NAMESIZE];
185         FILE *f;
186         int ret;
187         int len = 0;
188
189         ret = snprintf(path, sizeof(path), temp, iface->if_name);
190         if (ret == -1 || (size_t)ret >= sizeof(path)) {
191                 rte_errno = ENOBUFS;
192                 return 0;
193         }
194         f = fopen(path, "r");
195         if (!f) {
196                 rte_errno = errno;
197                 return 0;
198         }
199         ret = fscanf(f, NETVSC_CLASS_ID "%n", &len);
200         if (ret == EOF)
201                 rte_errno = errno;
202         ret = len == (int)strlen(NETVSC_CLASS_ID);
203         fclose(f);
204         return ret;
205 }
206
207 /**
208  * Determine if a network interface has a route.
209  *
210  * @param[in] name
211  *   Network device name.
212  * @param[in] family
213  *   Address family: AF_INET for IPv4 or AF_INET6 for IPv6.
214  *
215  * @return
216  *   1 when interface has a route, negative errno value in case of error and
217  *   0 otherwise.
218  */
219 static int
220 vdev_netvsc_has_route(const struct if_nameindex *iface,
221                       const unsigned char family)
222 {
223         /*
224          * The implementation can be simpler by getifaddrs() function usage but
225          * it works for IPv6 only starting from glibc 2.3.3.
226          */
227         char buf[4096];
228         int len;
229         int ret = 0;
230         int res;
231         int sock;
232         struct nlmsghdr *retmsg = (struct nlmsghdr *)buf;
233         struct sockaddr_nl sa;
234         struct {
235                 struct nlmsghdr nlhdr;
236                 struct ifaddrmsg addrmsg;
237         } msg;
238
239         if (!iface || (family != AF_INET && family != AF_INET6)) {
240                 DRV_LOG(ERR, "%s", rte_strerror(EINVAL));
241                 return -EINVAL;
242         }
243         sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
244         if (sock == -1) {
245                 DRV_LOG(ERR, "cannot open socket: %s", rte_strerror(errno));
246                 return -errno;
247         }
248         memset(&sa, 0, sizeof(sa));
249         sa.nl_family = AF_NETLINK;
250         sa.nl_groups = RTMGRP_LINK | RTMGRP_IPV4_IFADDR;
251         res = bind(sock, (struct sockaddr *)&sa, sizeof(sa));
252         if (res == -1) {
253                 ret = -errno;
254                 DRV_LOG(ERR, "cannot bind socket: %s", rte_strerror(errno));
255                 goto close;
256         }
257         memset(&msg, 0, sizeof(msg));
258         msg.nlhdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
259         msg.nlhdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
260         msg.nlhdr.nlmsg_type = RTM_GETADDR;
261         msg.nlhdr.nlmsg_pid = getpid();
262         msg.addrmsg.ifa_family = family;
263         msg.addrmsg.ifa_index = iface->if_index;
264         res = send(sock, &msg, msg.nlhdr.nlmsg_len, 0);
265         if (res == -1) {
266                 ret = -errno;
267                 DRV_LOG(ERR, "cannot send socket message: %s",
268                         rte_strerror(errno));
269                 goto close;
270         }
271         memset(buf, 0, sizeof(buf));
272         len = recv(sock, buf, sizeof(buf), 0);
273         if (len == -1) {
274                 ret = -errno;
275                 DRV_LOG(ERR, "cannot receive socket message: %s",
276                         rte_strerror(errno));
277                 goto close;
278         }
279         while (NLMSG_OK(retmsg, (unsigned int)len)) {
280                 struct ifaddrmsg *retaddr =
281                                 (struct ifaddrmsg *)NLMSG_DATA(retmsg);
282
283                 if (retaddr->ifa_family == family &&
284                     retaddr->ifa_index == iface->if_index) {
285                         struct rtattr *retrta = IFA_RTA(retaddr);
286                         int attlen = IFA_PAYLOAD(retmsg);
287
288                         while (RTA_OK(retrta, attlen)) {
289                                 if (retrta->rta_type == IFA_ADDRESS) {
290                                         ret = 1;
291                                         DRV_LOG(DEBUG, "interface %s has IP",
292                                                 iface->if_name);
293                                         goto close;
294                                 }
295                                 retrta = RTA_NEXT(retrta, attlen);
296                         }
297                 }
298                 retmsg = NLMSG_NEXT(retmsg, len);
299         }
300 close:
301         close(sock);
302         return ret;
303 }
304
/**
 * Retrieve network interface data from sysfs symbolic link.
 *
 * The link path "/sys/class/net/<if_name>/<relpath>" is built in a
 * temporary buffer of @p size bytes, so that readlink() never receives the
 * same object as both its path and its output buffer.
 *
 * @param[out] buf
 *   Output data buffer, NUL-terminated on success.
 * @param size
 *   Output buffer size. The link path itself must also fit in that many
 *   bytes.
 * @param[in] if_name
 *   Netdevice name.
 * @param[in] relpath
 *   Symbolic link path relative to netdevice sysfs entry.
 *
 * @return
 *   0 on success, a negative error code otherwise (-ENOBUFS when either the
 *   path or the link target does not fit, -ENOMEM when the temporary buffer
 *   cannot be allocated, negative errno when readlink() fails).
 */
static int
vdev_netvsc_sysfs_readlink(char *buf, size_t size, const char *if_name,
                           const char *relpath)
{
        char *path;
        ssize_t len;
        int ret;

        if (!size)
                return -ENOBUFS;
        path = malloc(size);
        if (!path)
                return -ENOMEM;
        ret = snprintf(path, size, "/sys/class/net/%s/%s", if_name, relpath);
        if (ret < 0 || (size_t)ret >= size) {
                ret = -ENOBUFS;
                goto out;
        }
        len = readlink(path, buf, size);
        if (len == -1) {
                /* Save errno before free() gets a chance to alter it. */
                ret = -errno;
                goto out;
        }
        /* readlink() does not terminate its output; keep room for NUL. */
        if ((size_t)len >= size - 1) {
                ret = -ENOBUFS;
                goto out;
        }
        buf[len] = '\0';
        ret = 0;
out:
        free(path);
        return ret;
}
337
338 /**
339  * Probe a network interface to associate with vdev_netvsc context.
340  *
341  * This function determines if the network device matches the properties of
342  * the NetVSC interface associated with the vdev_netvsc context and
343  * communicates its bus address to the fail-safe PMD instance if so.
344  *
345  * It is normally used with vdev_netvsc_foreach_iface().
346  *
347  * @param[in] iface
348  *   Pointer to netdevice description structure (name and index).
349  * @param[in] eth_addr
350  *   MAC address associated with @p iface.
351  * @param ap
352  *   Variable arguments list comprising:
353  *
354  *   - struct vdev_netvsc_ctx *ctx:
355  *     Context to associate network interface with.
356  *
357  * @return
358  *   A nonzero value when interface matches, 0 otherwise or in case of
359  *   error.
360  */
361 static int
362 vdev_netvsc_device_probe(const struct if_nameindex *iface,
363                     const struct ether_addr *eth_addr,
364                     va_list ap)
365 {
366         struct vdev_netvsc_ctx *ctx = va_arg(ap, struct vdev_netvsc_ctx *);
367         char buf[RTE_MAX(sizeof(ctx->yield), 256u)];
368         const char *addr;
369         size_t len;
370         int ret;
371
372         /* Skip non-matching or unwanted NetVSC interfaces. */
373         if (ctx->if_index == iface->if_index) {
374                 if (!strcmp(ctx->if_name, iface->if_name))
375                         return 0;
376                 DRV_LOG(DEBUG,
377                         "NetVSC interface \"%s\" (index %u) renamed \"%s\"",
378                         ctx->if_name, ctx->if_index, iface->if_name);
379                 strncpy(ctx->if_name, iface->if_name, sizeof(ctx->if_name));
380                 return 0;
381         }
382         if (vdev_netvsc_iface_is_netvsc(iface))
383                 return 0;
384         if (!is_same_ether_addr(eth_addr, &ctx->if_addr))
385                 return 0;
386         /* Look for associated PCI device. */
387         ret = vdev_netvsc_sysfs_readlink(buf, sizeof(buf), iface->if_name,
388                                          "device/subsystem");
389         if (ret)
390                 return 0;
391         addr = strrchr(buf, '/');
392         addr = addr ? addr + 1 : buf;
393         if (strcmp(addr, "pci"))
394                 return 0;
395         ret = vdev_netvsc_sysfs_readlink(buf, sizeof(buf), iface->if_name,
396                                          "device");
397         if (ret)
398                 return 0;
399         addr = strrchr(buf, '/');
400         addr = addr ? addr + 1 : buf;
401         len = strlen(addr);
402         if (!len)
403                 return 0;
404         /* Send PCI device argument to fail-safe PMD instance. */
405         if (strcmp(addr, ctx->yield))
406                 DRV_LOG(DEBUG, "associating PCI device \"%s\" with NetVSC"
407                         " interface \"%s\" (index %u)", addr, ctx->if_name,
408                         ctx->if_index);
409         memmove(buf, addr, len + 1);
410         addr = buf;
411         buf[len] = '\n';
412         ret = write(ctx->pipe[1], addr, len + 1);
413         buf[len] = '\0';
414         if (ret == -1) {
415                 if (errno == EINTR || errno == EAGAIN)
416                         return 1;
417                 DRV_LOG(WARNING, "cannot associate PCI device name \"%s\" with"
418                         " interface \"%s\": %s", addr, ctx->if_name,
419                         rte_strerror(errno));
420                 return 1;
421         }
422         if ((size_t)ret != len + 1) {
423                 /*
424                  * Attempt to override previous partial write, no need to
425                  * recover if that fails.
426                  */
427                 ret = write(ctx->pipe[1], "\n", 1);
428                 (void)ret;
429                 return 1;
430         }
431         fsync(ctx->pipe[1]);
432         memcpy(ctx->yield, addr, len + 1);
433         return 1;
434 }
435
436 /**
437  * Alarm callback that regularly probes system network interfaces.
438  *
439  * This callback runs at a frequency determined by VDEV_NETVSC_PROBE_MS as
440  * long as an vdev_netvsc context instance exists.
441  *
442  * @param arg
443  *   Ignored.
444  */
445 static void
446 vdev_netvsc_alarm(__rte_unused void *arg)
447 {
448         struct vdev_netvsc_ctx *ctx;
449         int ret;
450
451         LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
452                 ret = vdev_netvsc_foreach_iface(vdev_netvsc_device_probe, ctx);
453                 if (ret < 0)
454                         break;
455         }
456         if (!vdev_netvsc_ctx_count)
457                 return;
458         ret = rte_eal_alarm_set(VDEV_NETVSC_PROBE_MS * 1000,
459                                 vdev_netvsc_alarm, NULL);
460         if (ret < 0) {
461                 DRV_LOG(ERR, "unable to reschedule alarm callback: %s",
462                         rte_strerror(-ret));
463         }
464 }
465
466 /**
467  * Probe a NetVSC interface to generate a vdev_netvsc context from.
468  *
469  * This function instantiates vdev_netvsc contexts either for all NetVSC
470  * devices found on the system or only a subset provided as device
471  * arguments.
472  *
473  * It is normally used with vdev_netvsc_foreach_iface().
474  *
475  * @param[in] iface
476  *   Pointer to netdevice description structure (name and index).
477  * @param[in] eth_addr
478  *   MAC address associated with @p iface.
479  * @param ap
480  *   Variable arguments list comprising:
481  *
482  *   - const char *name:
483  *     Name associated with current driver instance.
484  *
485  *   - struct rte_kvargs *kvargs:
486  *     Device arguments provided to current driver instance.
487  *
488  *   - int force:
489  *     Accept specified interface even if not detected as NetVSC.
490  *
491  *   - unsigned int specified:
492  *     Number of specific netdevices provided as device arguments.
493  *
494  *   - unsigned int *matched:
495  *     The number of specified netdevices matched by this function.
496  *
497  * @return
498  *   A nonzero value when interface matches, 0 otherwise or in case of
499  *   error.
500  */
501 static int
502 vdev_netvsc_netvsc_probe(const struct if_nameindex *iface,
503                          const struct ether_addr *eth_addr,
504                          va_list ap)
505 {
506         const char *name = va_arg(ap, const char *);
507         struct rte_kvargs *kvargs = va_arg(ap, struct rte_kvargs *);
508         int force = va_arg(ap, int);
509         unsigned int specified = va_arg(ap, unsigned int);
510         unsigned int *matched = va_arg(ap, unsigned int *);
511         unsigned int i;
512         struct vdev_netvsc_ctx *ctx;
513         int ret;
514
515         /* Probe all interfaces when none are specified. */
516         if (specified) {
517                 for (i = 0; i != kvargs->count; ++i) {
518                         const struct rte_kvargs_pair *pair = &kvargs->pairs[i];
519
520                         if (!strcmp(pair->key, VDEV_NETVSC_ARG_IFACE)) {
521                                 if (!strcmp(pair->value, iface->if_name))
522                                         break;
523                         } else if (!strcmp(pair->key, VDEV_NETVSC_ARG_MAC)) {
524                                 struct ether_addr tmp;
525
526                                 if (sscanf(pair->value,
527                                            "%" SCNx8 ":%" SCNx8 ":%" SCNx8 ":"
528                                            "%" SCNx8 ":%" SCNx8 ":%" SCNx8,
529                                            &tmp.addr_bytes[0],
530                                            &tmp.addr_bytes[1],
531                                            &tmp.addr_bytes[2],
532                                            &tmp.addr_bytes[3],
533                                            &tmp.addr_bytes[4],
534                                            &tmp.addr_bytes[5]) != 6) {
535                                         DRV_LOG(ERR,
536                                                 "invalid MAC address format"
537                                                 " \"%s\"",
538                                                 pair->value);
539                                         return -EINVAL;
540                                 }
541                                 if (is_same_ether_addr(eth_addr, &tmp))
542                                         break;
543                         }
544                 }
545                 if (i == kvargs->count)
546                         return 0;
547                 ++(*matched);
548         }
549         /* Weed out interfaces already handled. */
550         LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry)
551                 if (ctx->if_index == iface->if_index)
552                         break;
553         if (ctx) {
554                 if (!specified)
555                         return 0;
556                 DRV_LOG(WARNING,
557                         "interface \"%s\" (index %u) is already handled,"
558                         " skipping",
559                         iface->if_name, iface->if_index);
560                 return 0;
561         }
562         if (!vdev_netvsc_iface_is_netvsc(iface)) {
563                 if (!specified || !force)
564                         return 0;
565                 DRV_LOG(WARNING,
566                         "using non-NetVSC interface \"%s\" (index %u)",
567                         iface->if_name, iface->if_index);
568         }
569         /* Routed NetVSC should not be probed. */
570         if (vdev_netvsc_has_route(iface, AF_INET) ||
571             vdev_netvsc_has_route(iface, AF_INET6)) {
572                 if (!specified || !force)
573                         return 0;
574                 DRV_LOG(WARNING, "probably using routed NetVSC interface \"%s\""
575                         " (index %u)", iface->if_name, iface->if_index);
576         }
577         /* Create interface context. */
578         ctx = calloc(1, sizeof(*ctx));
579         if (!ctx) {
580                 ret = -errno;
581                 DRV_LOG(ERR, "cannot allocate context for interface \"%s\": %s",
582                         iface->if_name, rte_strerror(errno));
583                 goto error;
584         }
585         ctx->id = vdev_netvsc_ctx_count;
586         strncpy(ctx->if_name, iface->if_name, sizeof(ctx->if_name));
587         ctx->if_index = iface->if_index;
588         ctx->if_addr = *eth_addr;
589         ctx->pipe[0] = -1;
590         ctx->pipe[1] = -1;
591         ctx->yield[0] = '\0';
592         if (pipe(ctx->pipe) == -1) {
593                 ret = -errno;
594                 DRV_LOG(ERR,
595                         "cannot allocate control pipe for interface \"%s\": %s",
596                         ctx->if_name, rte_strerror(errno));
597                 goto error;
598         }
599         for (i = 0; i != RTE_DIM(ctx->pipe); ++i) {
600                 int flf = fcntl(ctx->pipe[i], F_GETFL);
601
602                 if (flf != -1 &&
603                     fcntl(ctx->pipe[i], F_SETFL, flf | O_NONBLOCK) != -1)
604                         continue;
605                 ret = -errno;
606                 DRV_LOG(ERR, "cannot toggle non-blocking flag on control file"
607                         " descriptor #%u (%d): %s", i, ctx->pipe[i],
608                         rte_strerror(errno));
609                 goto error;
610         }
611         /* Generate virtual device name and arguments. */
612         i = 0;
613         ret = snprintf(ctx->name, sizeof(ctx->name), "%s_id%u",
614                        name, ctx->id);
615         if (ret == -1 || (size_t)ret >= sizeof(ctx->name))
616                 ++i;
617         ret = snprintf(ctx->devname, sizeof(ctx->devname), "net_failsafe_%s",
618                        ctx->name);
619         if (ret == -1 || (size_t)ret >= sizeof(ctx->devname))
620                 ++i;
621         ret = snprintf(ctx->devargs, sizeof(ctx->devargs),
622                        "fd(%d),dev(net_tap_%s,remote=%s)",
623                        ctx->pipe[0], ctx->name, ctx->if_name);
624         if (ret == -1 || (size_t)ret >= sizeof(ctx->devargs))
625                 ++i;
626         if (i) {
627                 ret = -ENOBUFS;
628                 DRV_LOG(ERR, "generated virtual device name or argument list"
629                         " too long for interface \"%s\"", ctx->if_name);
630                 goto error;
631         }
632         /* Request virtual device generation. */
633         DRV_LOG(DEBUG, "generating virtual device \"%s\" with arguments \"%s\"",
634                 ctx->devname, ctx->devargs);
635         vdev_netvsc_foreach_iface(vdev_netvsc_device_probe, ctx);
636         ret = rte_eal_hotplug_add("vdev", ctx->devname, ctx->devargs);
637         if (ret)
638                 goto error;
639         LIST_INSERT_HEAD(&vdev_netvsc_ctx_list, ctx, entry);
640         ++vdev_netvsc_ctx_count;
641         DRV_LOG(DEBUG, "added NetVSC interface \"%s\" to context list",
642                 ctx->if_name);
643         return 0;
644 error:
645         if (ctx)
646                 vdev_netvsc_ctx_destroy(ctx);
647         return ret;
648 }
649
650 /**
651  * Probe NetVSC interfaces.
652  *
653  * This function probes system netdevices according to the specified device
654  * arguments and starts a periodic alarm callback to notify the resulting
655  * fail-safe PMD instances of their sub-devices whereabouts.
656  *
657  * @param dev
658  *   Virtual device context for driver instance.
659  *
660  * @return
661  *    Always 0, even in case of errors.
662  */
663 static int
664 vdev_netvsc_vdev_probe(struct rte_vdev_device *dev)
665 {
666         static const char *const vdev_netvsc_arg[] = {
667                 VDEV_NETVSC_ARG_IFACE,
668                 VDEV_NETVSC_ARG_MAC,
669                 VDEV_NETVSC_ARG_FORCE,
670                 VDEV_NETVSC_ARG_IGNORE,
671                 NULL,
672         };
673         const char *name = rte_vdev_device_name(dev);
674         const char *args = rte_vdev_device_args(dev);
675         struct rte_kvargs *kvargs = rte_kvargs_parse(args ? args : "",
676                                                      vdev_netvsc_arg);
677         unsigned int specified = 0;
678         unsigned int matched = 0;
679         int force = 0;
680         int ignore = 0;
681         unsigned int i;
682         int ret;
683
684         DRV_LOG(DEBUG, "invoked as \"%s\", using arguments \"%s\"", name, args);
685         if (!kvargs) {
686                 DRV_LOG(ERR, "cannot parse arguments list");
687                 goto error;
688         }
689         for (i = 0; i != kvargs->count; ++i) {
690                 const struct rte_kvargs_pair *pair = &kvargs->pairs[i];
691
692                 if (!strcmp(pair->key, VDEV_NETVSC_ARG_FORCE))
693                         force = !!atoi(pair->value);
694                 else if (!strcmp(pair->key, VDEV_NETVSC_ARG_IGNORE))
695                         ignore = !!atoi(pair->value);
696                 else if (!strcmp(pair->key, VDEV_NETVSC_ARG_IFACE) ||
697                          !strcmp(pair->key, VDEV_NETVSC_ARG_MAC))
698                         ++specified;
699         }
700         if (ignore) {
701                 if (kvargs)
702                         rte_kvargs_free(kvargs);
703                 return 0;
704         }
705         rte_eal_alarm_cancel(vdev_netvsc_alarm, NULL);
706         /* Gather interfaces. */
707         ret = vdev_netvsc_foreach_iface(vdev_netvsc_netvsc_probe, name, kvargs,
708                                         force, specified, &matched);
709         if (ret < 0)
710                 goto error;
711         if (matched < specified)
712                 DRV_LOG(WARNING,
713                         "some of the specified parameters did not match"
714                         " recognized network interfaces");
715         ret = rte_eal_alarm_set(VDEV_NETVSC_PROBE_MS * 1000,
716                                 vdev_netvsc_alarm, NULL);
717         if (ret < 0) {
718                 DRV_LOG(ERR, "unable to schedule alarm callback: %s",
719                         rte_strerror(-ret));
720                 goto error;
721         }
722 error:
723         if (kvargs)
724                 rte_kvargs_free(kvargs);
725         ++vdev_netvsc_ctx_inst;
726         return 0;
727 }
728
729 /**
730  * Remove driver instance.
731  *
732  * The alarm callback and underlying vdev_netvsc context instances are only
733  * destroyed after the last PMD instance is removed.
734  *
735  * @param dev
736  *   Virtual device context for driver instance.
737  *
738  * @return
739  *   Always 0.
740  */
741 static int
742 vdev_netvsc_vdev_remove(__rte_unused struct rte_vdev_device *dev)
743 {
744         if (--vdev_netvsc_ctx_inst)
745                 return 0;
746         rte_eal_alarm_cancel(vdev_netvsc_alarm, NULL);
747         while (!LIST_EMPTY(&vdev_netvsc_ctx_list)) {
748                 struct vdev_netvsc_ctx *ctx = LIST_FIRST(&vdev_netvsc_ctx_list);
749
750                 LIST_REMOVE(ctx, entry);
751                 --vdev_netvsc_ctx_count;
752                 vdev_netvsc_ctx_destroy(ctx);
753         }
754         return 0;
755 }
756
757 /** Virtual device descriptor. */
758 static struct rte_vdev_driver vdev_netvsc_vdev = {
759         .probe = vdev_netvsc_vdev_probe,
760         .remove = vdev_netvsc_vdev_remove,
761 };
762
763 RTE_PMD_REGISTER_VDEV(VDEV_NETVSC_DRIVER, vdev_netvsc_vdev);
764 RTE_PMD_REGISTER_ALIAS(VDEV_NETVSC_DRIVER, eth_vdev_netvsc);
765 RTE_PMD_REGISTER_PARAM_STRING(net_vdev_netvsc,
766                               VDEV_NETVSC_ARG_IFACE "=<string> "
767                               VDEV_NETVSC_ARG_MAC "=<string> "
768                               VDEV_NETVSC_ARG_FORCE "=<int> "
769                               VDEV_NETVSC_ARG_IGNORE "=<int>");
770
771 /** Initialize driver log type. */
772 RTE_INIT(vdev_netvsc_init_log)
773 {
774         vdev_netvsc_logtype = rte_log_register("pmd.vdev_netvsc");
775         if (vdev_netvsc_logtype >= 0)
776                 rte_log_set_level(vdev_netvsc_logtype, RTE_LOG_NOTICE);
777 }
778
779 /** Compare function for vdev find device operation. */
780 static int
781 vdev_netvsc_cmp_rte_device(const struct rte_device *dev1,
782                            __rte_unused const void *_dev2)
783 {
784         return strcmp(dev1->devargs->name, VDEV_NETVSC_DRIVER_NAME);
785 }
786
787 /**
788  * A callback called by vdev bus scan function to ensure this driver probing
789  * automatically in Hyper-V VM system unless it already exists in the
790  * devargs list.
791  */
792 static void
793 vdev_netvsc_scan_callback(__rte_unused void *arg)
794 {
795         struct rte_vdev_device *dev;
796         struct rte_devargs *devargs;
797         struct rte_bus *vbus = rte_bus_find_by_name("vdev");
798
799         TAILQ_FOREACH(devargs, &devargs_list, next)
800                 if (!strcmp(devargs->name, VDEV_NETVSC_DRIVER_NAME))
801                         return;
802         dev = (struct rte_vdev_device *)vbus->find_device(NULL,
803                 vdev_netvsc_cmp_rte_device, VDEV_NETVSC_DRIVER_NAME);
804         if (dev)
805                 return;
806         if (rte_eal_devargs_add(RTE_DEVTYPE_VIRTUAL, VDEV_NETVSC_DRIVER_NAME))
807                 DRV_LOG(ERR, "unable to add netvsc devargs.");
808 }
809
810 /** Initialize the custom scan. */
811 RTE_INIT(vdev_netvsc_custom_scan_add)
812 {
813         if (rte_hypervisor_get() == RTE_HYPERVISOR_HYPERV)
814                 rte_vdev_add_custom_scan(vdev_netvsc_scan_callback, NULL);
815 }