From: Intel Date: Mon, 3 Jun 2013 00:00:00 +0000 (+0000) Subject: power: initial import X-Git-Tag: spdx-start~11254 X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=d7937e2e3d12;p=dpdk.git power: initial import Signed-off-by: Intel --- diff --git a/app/test/Makefile b/app/test/Makefile index adc57ef4d1..d457ab45dd 100755 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -87,6 +87,7 @@ SRCS-$(CONFIG_RTE_APP_TEST) += test_cmdline_string.c SRCS-$(CONFIG_RTE_APP_TEST) += test_cmdline_lib.c SRCS-$(CONFIG_RTE_APP_TEST) += test_pmac_pm.c SRCS-$(CONFIG_RTE_APP_TEST) += test_pmac_acl.c +SRCS-$(CONFIG_RTE_APP_TEST) += test_power.c CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py index ce2dc12e8d..f64ea9cec2 100755 --- a/app/test/autotest_data.py +++ b/app/test/autotest_data.py @@ -363,6 +363,19 @@ non_parallel_test_group_list = [ }, ] }, +{ + "Prefix" : "power", + "Memory" : all_sockets(512), + "Tests" : + [ + { + "Name" : "Power autotest", + "Command" : "power_autotest", + "Func" : default_autotest, + "Report" : None, + }, + ] +}, { "Prefix" : "lpm6", "Memory" : "512", diff --git a/app/test/commands.c b/app/test/commands.c index 92bf88b9fb..ae94badafd 100755 --- a/app/test/commands.c +++ b/app/test/commands.c @@ -171,6 +171,8 @@ static void cmd_autotest_parsed(void *parsed_result, ret |= test_pmac_pm(); if (all || !strcmp(res->autotest, "acl_autotest")) ret |= test_pmac_acl(); + if (all || !strcmp(res->autotest, "power_autotest")) + ret |= test_power(); if (ret == 0) printf("Test OK\n"); @@ -200,7 +202,7 @@ cmdline_parse_token_string_t cmd_autotest_autotest = "cmdline_autotest#func_reentrancy_autotest#" "mempool_perf_autotest#hash_perf_autotest#" "memcpy_perf_autotest#pm_autotest#" - "acl_autotest#" + "acl_autotest#power_autotest#" "all_autotests"); cmdline_parse_inst_t cmd_autotest = { diff --git a/app/test/test.h b/app/test/test.h index ab7a868978..632860bfcd 100755 --- a/app/test/test.h +++ b/app/test/test.h @@ -86,6 +86,7 @@ int test_cmdline(void); int test_func_reentrancy(void); int test_pmac_pm(void); int test_pmac_acl(void); +int test_power(void); int test_pci_run; diff --git a/app/test/test_power.c b/app/test/test_power.c new file mode 100644 index 0000000000..1e6e0d5e09 --- /dev/null +++ b/app/test/test_power.c @@ -0,0 +1,487 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2013 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "test.h" + +#ifdef RTE_LIBRTE_POWER + +#include + +#define TEST_POWER_LCORE_ID 2U +#define TEST_POWER_LCORE_INVALID ((unsigned)RTE_MAX_LCORE) +#define TEST_POWER_FREQS_NUM_MAX ((unsigned)RTE_MAX_LCORE_FREQS) + +#define TEST_POWER_SYSFILE_CUR_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" + +static uint32_t total_freq_num; +static uint32_t freqs[TEST_POWER_FREQS_NUM_MAX]; + +static int +check_cur_freq(unsigned lcore_id, uint32_t idx) +{ +#define TEST_POWER_CONVERT_TO_DECIMAL 10 + FILE *f; + char fullpath[PATH_MAX]; + char buf[BUFSIZ]; + uint32_t cur_freq; + int ret = -1; + + if (rte_snprintf(fullpath, sizeof(fullpath), + TEST_POWER_SYSFILE_CUR_FREQ, lcore_id) < 0) { + return 0; + } + f = fopen(fullpath, "r"); + if (f == NULL) { + return 0; + } + if (fgets(buf, sizeof(buf), f) == NULL) { + goto fail_get_cur_freq; + } + cur_freq = strtoul(buf, NULL, TEST_POWER_CONVERT_TO_DECIMAL); + ret = (freqs[idx] == cur_freq ? 0 : -1); + +fail_get_cur_freq: + fclose(f); + + return ret; +} + +/* Check rte_power_freqs() */ +static int +check_power_freqs(void) +{ + uint32_t ret; + + total_freq_num = 0; + memset(freqs, 0, sizeof(freqs)); + + /* test with an invalid lcore id */ + ret = rte_power_freqs(TEST_POWER_LCORE_INVALID, freqs, + TEST_POWER_FREQS_NUM_MAX); + if (ret > 0) { + printf("Unexpectedly get available freqs successfully on " + "lcore %u\n", TEST_POWER_LCORE_INVALID); + return -1; + } + + /* test with NULL buffer to save available freqs */ + ret = rte_power_freqs(TEST_POWER_LCORE_ID, NULL, + TEST_POWER_FREQS_NUM_MAX); + if (ret > 0) { + printf("Unexpectedly get available freqs successfully with " + "NULL buffer on lcore %u\n", TEST_POWER_LCORE_ID); + return -1; + } + + /* test of getting zero number of freqs */ + ret = rte_power_freqs(TEST_POWER_LCORE_ID, freqs, 0); + if (ret > 0) { + printf("Unexpectedly get available freqs successfully with " + "zero buffer size on lcore %u\n", TEST_POWER_LCORE_ID); + return -1; + } + + /* test with all valid input parameters */ + ret = rte_power_freqs(TEST_POWER_LCORE_ID, freqs, + TEST_POWER_FREQS_NUM_MAX); + if (ret == 0 || ret > TEST_POWER_FREQS_NUM_MAX) { + printf("Fail to get available freqs on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Save the total number of available freqs */ + total_freq_num = ret; + + return 0; +} + +/* Check rte_power_get_freq() */ +static int +check_power_get_freq(void) +{ + int ret; + uint32_t count; + + /* test with an invalid lcore id */ + count = rte_power_get_freq(TEST_POWER_LCORE_INVALID); + if (count < TEST_POWER_FREQS_NUM_MAX) { + printf("Unexpectedly get freq index successfully on " + "lcore %u\n", TEST_POWER_LCORE_INVALID); + return -1; + } + + count = rte_power_get_freq(TEST_POWER_LCORE_ID); + if (count >= TEST_POWER_FREQS_NUM_MAX) { + printf("Fail to get the freq index on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Check the current frequency */ + ret = check_cur_freq(TEST_POWER_LCORE_ID, count); + if (ret < 0) + return -1; + + return 0; +} + +/* Check rte_power_set_freq() */ +static int +check_power_set_freq(void) +{ + int ret; + + /* test with an invalid lcore id */ + ret = rte_power_set_freq(TEST_POWER_LCORE_INVALID, 0); + if (ret >= 0) { + printf("Unexpectedly set freq index successfully on " + "lcore %u\n", TEST_POWER_LCORE_INVALID); + return -1; + } + + /* test with an invalid freq index */ + ret = rte_power_set_freq(TEST_POWER_LCORE_ID, + TEST_POWER_FREQS_NUM_MAX); + if (ret >= 0) { + printf("Unexpectedly set an invalid freq index (%u)" + "successfully on lcore %u\n", TEST_POWER_FREQS_NUM_MAX, + TEST_POWER_LCORE_ID); + return -1; + } + + /** + * test with an invalid freq index which is right one bigger than + * total number of freqs + */ + ret = rte_power_set_freq(TEST_POWER_LCORE_ID, total_freq_num); + if (ret >= 0) { + printf("Unexpectedly set an invalid freq index (%u)" + "successfully on lcore %u\n", total_freq_num, + TEST_POWER_LCORE_ID); + return -1; + } + ret = rte_power_set_freq(TEST_POWER_LCORE_ID, 3); + if (ret < 0) { + printf("Fail to set freq index on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Check the current frequency */ + ret = check_cur_freq(TEST_POWER_LCORE_ID, 3); + if (ret < 0) + return -1; + + return 0; +} + +/* Check rte_power_freq_down() */ +static int +check_power_freq_down(void) +{ + int ret; + + /* test with an invalid lcore id */ + ret = rte_power_freq_down(TEST_POWER_LCORE_INVALID); + if (ret >= 0) { + printf("Unexpectedly scale down successfully the freq on " + "lcore %u\n", TEST_POWER_LCORE_INVALID); + return -1; + } + + /* Scale down to min and then scale down one step */ + ret = rte_power_freq_min(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale down the freq to min on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + ret = rte_power_freq_down(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale down the freq on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Check the current frequency */ + ret = check_cur_freq(TEST_POWER_LCORE_ID, total_freq_num - 1); + if (ret < 0) + return -1; + + /* Scale up to max and then scale down one step */ + ret = rte_power_freq_max(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale up the freq to max on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + ret = rte_power_freq_down(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf ("Fail to scale down the freq on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Check the current frequency */ + ret = check_cur_freq(TEST_POWER_LCORE_ID, 1); + if (ret < 0) + return -1; + + return 0; +} + +/* Check rte_power_freq_up() */ +static int +check_power_freq_up(void) +{ + int ret; + + /* test with an invalid lcore id */ + ret = rte_power_freq_up(TEST_POWER_LCORE_INVALID); + if (ret >= 0) { + printf("Unexpectedly scale up successfully the freq on %u\n", + TEST_POWER_LCORE_INVALID); + return -1; + } + + /* Scale down to min and then scale up one step */ + ret = rte_power_freq_min(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale down the freq to min on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + ret = rte_power_freq_up(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale up the freq on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Check the current frequency */ + ret = check_cur_freq(TEST_POWER_LCORE_ID, total_freq_num - 2); + if (ret < 0) + return -1; + + /* Scale up to max and then scale up one step */ + ret = rte_power_freq_max(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale up the freq to max on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + ret = rte_power_freq_up(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale up the freq on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Check the current frequency */ + ret = check_cur_freq(TEST_POWER_LCORE_ID, 0); + if (ret < 0) + return -1; + + return 0; +} + +/* Check rte_power_freq_max() */ +static int +check_power_freq_max(void) +{ + int ret; + + /* test with an invalid lcore id */ + ret = rte_power_freq_max(TEST_POWER_LCORE_INVALID); + if (ret >= 0) { + printf("Unexpectedly scale up successfully the freq to max on " + "lcore %u\n", TEST_POWER_LCORE_INVALID); + return -1; + } + ret = rte_power_freq_max(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale up the freq to max on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Check the current frequency */ + ret = check_cur_freq(TEST_POWER_LCORE_ID, 0); + if (ret < 0) + return -1; + + return 0; +} + +/* Check rte_power_freq_min() */ +static int +check_power_freq_min(void) +{ + int ret; + + /* test with an invalid lcore id */ + ret = rte_power_freq_min(TEST_POWER_LCORE_INVALID); + if (ret >= 0) { + printf("Unexpectedly scale down successfully the freq to min " + "on lcore %u\n", TEST_POWER_LCORE_INVALID); + return -1; + } + ret = rte_power_freq_min(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Fail to scale down the freq to min on lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /* Check the current frequency */ + ret = check_cur_freq(TEST_POWER_LCORE_ID, total_freq_num - 1); + if (ret < 0) + return -1; + + return 0; +} + +int +test_power(void) +{ + int ret = -1; + + /* test of init power management for an invalid lcore */ + ret = rte_power_init(TEST_POWER_LCORE_INVALID); + if (ret == 0) { + printf("Unexpectedly initialise power management successfully " + "for lcore %u\n", TEST_POWER_LCORE_INVALID); + return -1; + } + + ret = rte_power_init(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Cannot initialise power management for lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /** + * test of initialising power management for the lcore which has + * been initialised + */ + ret = rte_power_init(TEST_POWER_LCORE_ID); + if (ret == 0) { + printf("Unexpectedly init successfully power twice on " + "lcore %u\n", TEST_POWER_LCORE_ID); + return -1; + } + + ret = check_power_freqs(); + if (ret < 0) + goto fail_all; + + ret = check_power_get_freq(); + if (ret < 0) + goto fail_all; + + ret = check_power_set_freq(); + if (ret < 0) + goto fail_all; + + ret = check_power_freq_down(); + if (ret < 0) + goto fail_all; + + ret = check_power_freq_up(); + if (ret < 0) + goto fail_all; + + ret = check_power_freq_max(); + if (ret < 0) + goto fail_all; + + ret = check_power_freq_min(); + if (ret < 0) + goto fail_all; + + ret = rte_power_exit(TEST_POWER_LCORE_ID); + if (ret < 0) { + printf("Cannot exit power management for lcore %u\n", + TEST_POWER_LCORE_ID); + return -1; + } + + /** + * test of exiting power management for the lcore which has been exited + */ + ret = rte_power_exit(TEST_POWER_LCORE_ID); + if (ret == 0) { + printf("Unexpectedly exit successfully power management twice " + "on lcore %u\n", TEST_POWER_LCORE_ID); + return -1; + } + + /* test of exit power management for an invalid lcore */ + ret = rte_power_exit(TEST_POWER_LCORE_INVALID); + if (ret == 0) { + printf("Unpectedly exit power management successfully for " + "lcore %u\n", TEST_POWER_LCORE_INVALID); + return -1; + } + + return 0; + +fail_all: + rte_power_exit(TEST_POWER_LCORE_ID); + + return -1; +} + +#else /* RTE_LIBRTE_POWER */ + +int +test_power(void) +{ + printf("The power library is not included in this build\n"); + return 0; +} + +#endif /* RTE_LIBRTE_POWER */ + diff --git a/config/defconfig_i686-default-linuxapp-gcc b/config/defconfig_i686-default-linuxapp-gcc index 57d768f838..686f1d15cb 100644 --- a/config/defconfig_i686-default-linuxapp-gcc +++ b/config/defconfig_i686-default-linuxapp-gcc @@ -216,6 +216,13 @@ CONFIG_RTE_LIBRTE_HASH_DEBUG=n CONFIG_RTE_LIBRTE_LPM=y CONFIG_RTE_LIBRTE_LPM_DEBUG=n +# +# Compile librte_power +# +CONFIG_RTE_LIBRTE_POWER=y +CONFIG_RTE_LIBRTE_POWER_DEBUG=n +CONFIG_RTE_MAX_LCORE_FREQS=64 + # # Compile librte_net # diff --git a/config/defconfig_i686-default-linuxapp-icc b/config/defconfig_i686-default-linuxapp-icc index 44f1b054e3..5247d5b147 100644 --- a/config/defconfig_i686-default-linuxapp-icc +++ b/config/defconfig_i686-default-linuxapp-icc @@ -217,6 +217,13 @@ CONFIG_RTE_LIBRTE_HASH_DEBUG=n CONFIG_RTE_LIBRTE_LPM=y CONFIG_RTE_LIBRTE_LPM_DEBUG=n +# +# Compile librte_power +# +CONFIG_RTE_LIBRTE_POWER=y +CONFIG_RTE_LIBRTE_POWER_DEBUG=n +CONFIG_RTE_MAX_LCORE_FREQS=64 + # # Compile librte_net # diff --git a/config/defconfig_x86_64-default-linuxapp-gcc b/config/defconfig_x86_64-default-linuxapp-gcc index 8e76034642..dbb1068013 100644 --- a/config/defconfig_x86_64-default-linuxapp-gcc +++ b/config/defconfig_x86_64-default-linuxapp-gcc @@ -217,6 +217,13 @@ CONFIG_RTE_LIBRTE_HASH_DEBUG=n CONFIG_RTE_LIBRTE_LPM=y CONFIG_RTE_LIBRTE_LPM_DEBUG=n +# +# Compile librte_power +# +CONFIG_RTE_LIBRTE_POWER=y +CONFIG_RTE_LIBRTE_POWER_DEBUG=n +CONFIG_RTE_MAX_LCORE_FREQS=64 + # # Compile librte_net # diff --git a/config/defconfig_x86_64-default-linuxapp-icc b/config/defconfig_x86_64-default-linuxapp-icc index edcacf3579..68dd9c3798 100644 --- a/config/defconfig_x86_64-default-linuxapp-icc +++ b/config/defconfig_x86_64-default-linuxapp-icc @@ -217,6 +217,13 @@ CONFIG_RTE_LIBRTE_HASH_DEBUG=n CONFIG_RTE_LIBRTE_LPM=y CONFIG_RTE_LIBRTE_LPM_DEBUG=n +# +# Compile librte_power +# +CONFIG_RTE_LIBRTE_POWER=y +CONFIG_RTE_LIBRTE_POWER_DEBUG=n +CONFIG_RTE_MAX_LCORE_FREQS=64 + # # Compile librte_net # diff --git a/examples/l3fwd-power/Makefile b/examples/l3fwd-power/Makefile new file mode 100644 index 0000000000..80a0993279 --- /dev/null +++ b/examples/l3fwd-power/Makefile @@ -0,0 +1,57 @@ +# BSD LICENSE +# +# Copyright(c) 2010-2013 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +# Default target, can be overriden by command line or environment +RTE_TARGET ?= x86_64-default-linuxapp-gcc + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = l3fwd-power + +# all source are stored in SRCS-y +SRCS-y := main.c + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_main.o += -Wno-return-type +endif + +include $(RTE_SDK)/mk/rte.extapp.mk diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c new file mode 100644 index 0000000000..d0b52890cc --- /dev/null +++ b/examples/l3fwd-power/main.c @@ -0,0 +1,1714 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2013 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "main.h" + +#define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1 + +#define MAX_PKT_BURST 32 + +#define MIN_ZERO_POLL_COUNT 5 + +/* around 100ms at 2 Ghz */ +#define TIMER_RESOLUTION_CYCLES 200000000ULL +/* 100 ms interval */ +#define TIMER_NUMBER_PER_SECOND 10 +/* 100000 us */ +#define SCALING_PERIOD (1000000/TIMER_NUMBER_PER_SECOND) +#define SCALING_DOWN_TIME_RATIO_THRESHOLD 0.25 + +#define APP_LOOKUP_EXACT_MATCH 0 +#define APP_LOOKUP_LPM 1 +#define DO_RFC_1812_CHECKS + +#ifndef APP_LOOKUP_METHOD +#define APP_LOOKUP_METHOD APP_LOOKUP_LPM +#endif + +#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) +#include +#elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) +#include +#else +#error "APP_LOOKUP_METHOD set to incorrect value" +#endif + +#ifndef IPv6_BYTES +#define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\ + "%02x%02x:%02x%02x:%02x%02x:%02x%02x" +#define IPv6_BYTES(addr) \ + addr[0], addr[1], addr[2], addr[3], \ + addr[4], addr[5], addr[6], addr[7], \ + addr[8], addr[9], addr[10], addr[11],\ + addr[12], addr[13],addr[14], addr[15] +#endif + +#define MAX_JUMBO_PKT_LEN 9600 + +#define IPV6_ADDR_LEN 16 + +#define MEMPOOL_CACHE_SIZE 256 + +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) + +/* + * This expression is used to calculate the number of mbufs needed depending on + * user input, taking into account memory for rx and tx hardware rings, cache + * per lcore and mtable per port per lcore. RTE_MAX is used to ensure that + * NB_MBUF never goes below a minimum value of 8192. + */ + +#define NB_MBUF RTE_MAX ( \ + (nb_ports*nb_rx_queue*RTE_TEST_RX_DESC_DEFAULT + \ + nb_ports*nb_lcores*MAX_PKT_BURST + \ + nb_ports*n_tx_queue*RTE_TEST_TX_DESC_DEFAULT + \ + nb_lcores*MEMPOOL_CACHE_SIZE), \ + (unsigned)8192) + +/* + * RX and TX Prefetch, Host, and Write-back threshold values should be + * carefully set for optimal performance. Consult the network + * controller's datasheet and supporting DPDK documentation for guidance + * on how these parameters should be set. + */ +#define RX_PTHRESH 8 /**< Default values of RX prefetch threshold reg. */ +#define RX_HTHRESH 8 /**< Default values of RX host threshold reg. */ +#define RX_WTHRESH 4 /**< Default values of RX write-back threshold reg. */ + +/* + * These default values are optimized for use with the Intel(R) 82599 10 GbE + * Controller and the DPDK ixgbe PMD. Consider using other values for other + * network controllers and/or network drivers. + */ +#define TX_PTHRESH 36 /**< Default values of TX prefetch threshold reg. */ +#define TX_HTHRESH 0 /**< Default values of TX host threshold reg. */ +#define TX_WTHRESH 0 /**< Default values of TX write-back threshold reg. */ + +#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ + +#define NB_SOCKETS 8 + +/* Configure how many packets ahead to prefetch, when reading packets */ +#define PREFETCH_OFFSET 3 + +/* + * Configurable number of RX/TX ring descriptors + */ +#define RTE_TEST_RX_DESC_DEFAULT 128 +#define RTE_TEST_TX_DESC_DEFAULT 512 +static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT; +static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT; + +/* ethernet addresses of ports */ +static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; + +/* mask of enabled ports */ +static uint32_t enabled_port_mask = 0; +/* Ports set in promiscuous mode off by default. */ +static int promiscuous_on = 0; +/* NUMA is enabled by default. */ +static int numa_on = 1; + +enum freq_scale_hint_t +{ + FREQ_LOWER = -1, + FREQ_CURRENT = 0, + FREQ_HIGHER = 1, + FREQ_HIGHEST = 2 +}; + +struct mbuf_table { + uint16_t len; + struct rte_mbuf *m_table[MAX_PKT_BURST]; +}; + +struct lcore_rx_queue { + uint8_t port_id; + uint8_t queue_id; + enum freq_scale_hint_t freq_up_hint; + uint32_t zero_rx_packet_count; + uint32_t idle_hint; +} __rte_cache_aligned; + +#define MAX_RX_QUEUE_PER_LCORE 16 +#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS +#define MAX_RX_QUEUE_PER_PORT 128 + +#define MAX_LCORE_PARAMS 1024 +struct lcore_params { + uint8_t port_id; + uint8_t queue_id; + uint8_t lcore_id; +} __rte_cache_aligned; + +static struct lcore_params lcore_params_array[MAX_LCORE_PARAMS]; +static struct lcore_params lcore_params_array_default[] = { + {0, 0, 2}, + {0, 1, 2}, + {0, 2, 2}, + {1, 0, 2}, + {1, 1, 2}, + {1, 2, 2}, + {2, 0, 2}, + {3, 0, 3}, + {3, 1, 3}, +}; + +static struct lcore_params * lcore_params = lcore_params_array_default; +static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) / + sizeof(lcore_params_array_default[0]); + +static struct rte_eth_conf port_conf = { + .rxmode = { + .max_rx_pkt_len = ETHER_MAX_LEN, + .split_hdr_size = 0, + .header_split = 0, /**< Header Split disabled */ + .hw_ip_checksum = 1, /**< IP checksum offload enabled */ + .hw_vlan_filter = 0, /**< VLAN filtering disabled */ + .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ + .hw_strip_crc = 0, /**< CRC stripped by hardware */ + }, + .rx_adv_conf = { + .rss_conf = { + .rss_key = NULL, + .rss_hf = ETH_RSS_IPV4 | ETH_RSS_IPV6, + }, + }, + .txmode = { + .mq_mode = ETH_DCB_NONE, + }, +}; + +static const struct rte_eth_rxconf rx_conf = { + .rx_thresh = { + .pthresh = RX_PTHRESH, + .hthresh = RX_HTHRESH, + .wthresh = RX_WTHRESH, + }, + .rx_free_thresh = 32, +}; + +static const struct rte_eth_txconf tx_conf = { + .tx_thresh = { + .pthresh = TX_PTHRESH, + .hthresh = TX_HTHRESH, + .wthresh = TX_WTHRESH, + }, + .tx_free_thresh = 0, /* Use PMD default values */ + .tx_rs_thresh = 0, /* Use PMD default values */ + .txq_flags = 0x0, +}; + +static struct rte_mempool * pktmbuf_pool[NB_SOCKETS]; + + +#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) + +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2 +#include +#define DEFAULT_HASH_FUNC rte_hash_crc +#else +#include +#define DEFAULT_HASH_FUNC rte_jhash +#endif + +struct ipv4_5tuple { + uint32_t ip_dst; + uint32_t ip_src; + uint16_t port_dst; + uint16_t port_src; + uint8_t proto; +} __attribute__((__packed__)); + +struct ipv6_5tuple { + uint8_t ip_dst[IPV6_ADDR_LEN]; + uint8_t ip_src[IPV6_ADDR_LEN]; + uint16_t port_dst; + uint16_t port_src; + uint8_t proto; +} __attribute__((__packed__)); + +struct ipv4_l3fwd_route { + struct ipv4_5tuple key; + uint8_t if_out; +}; + +struct ipv6_l3fwd_route { + struct ipv6_5tuple key; + uint8_t if_out; +}; + +static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { + {{IPv4(100,10,0,1), IPv4(200,10,0,1), 101, 11, IPPROTO_TCP}, 0}, + {{IPv4(100,20,0,2), IPv4(200,20,0,2), 102, 12, IPPROTO_TCP}, 1}, + {{IPv4(100,30,0,3), IPv4(200,30,0,3), 103, 13, IPPROTO_TCP}, 2}, + {{IPv4(100,40,0,4), IPv4(200,40,0,4), 104, 14, IPPROTO_TCP}, 3}, +}; + +static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = { + { + { + {0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, + {0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x1e, 0x67, 0xff, 0xfe, 0x0d, 0xb6, 0x0a}, + 1, 10, IPPROTO_UDP + }, 4 + }, +}; + +typedef struct rte_hash lookup_struct_t; +static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; +static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS]; + +#define L3FWD_HASH_ENTRIES 1024 + +#define IPV4_L3FWD_NUM_ROUTES \ + (sizeof(ipv4_l3fwd_route_array) / sizeof(ipv4_l3fwd_route_array[0])) + +#define IPV6_L3FWD_NUM_ROUTES \ + (sizeof(ipv6_l3fwd_route_array) / sizeof(ipv6_l3fwd_route_array[0])) + +static uint8_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; +static uint8_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; +#endif + +#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) +struct ipv4_l3fwd_route { + uint32_t ip; + uint8_t depth; + uint8_t if_out; +}; + +static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { + {IPv4(1,1,1,0), 24, 0}, + {IPv4(2,1,1,0), 24, 1}, + {IPv4(3,1,1,0), 24, 2}, + {IPv4(4,1,1,0), 24, 3}, + {IPv4(5,1,1,0), 24, 4}, + {IPv4(6,1,1,0), 24, 5}, + {IPv4(7,1,1,0), 24, 6}, + {IPv4(8,1,1,0), 24, 7}, +}; + +#define IPV4_L3FWD_NUM_ROUTES \ + (sizeof(ipv4_l3fwd_route_array) / sizeof(ipv4_l3fwd_route_array[0])) + +#define IPV4_L3FWD_LPM_MAX_RULES 1024 + +typedef struct rte_lpm lookup_struct_t; +static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; +#endif + +struct lcore_conf { + uint16_t n_rx_queue; + struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; + uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; + struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS]; + lookup_struct_t * ipv4_lookup_struct; + lookup_struct_t * ipv6_lookup_struct; +} __rte_cache_aligned; + +struct lcore_stats { + /* total sleep time in ms since last frequency scaling down */ + uint32_t sleep_time; + /* number of long sleep recently */ + uint32_t nb_long_sleep; + /* freq. scaling up trend */ + uint32_t trend; + /* total packet processed recently */ + uint64_t nb_rx_processed; + /* total iterations looped recently */ + uint64_t nb_iteration_looped; + uint32_t padding[9]; +} __rte_cache_aligned; + +static struct lcore_conf lcore_conf[RTE_MAX_LCORE] __rte_cache_aligned; +static struct lcore_stats stats[RTE_MAX_LCORE] __rte_cache_aligned; +static struct rte_timer power_timers[RTE_MAX_LCORE]; + +static inline uint32_t power_idle_heuristic(uint32_t zero_rx_packet_count); +static inline enum freq_scale_hint_t power_freq_scaleup_heuristic( \ + unsigned lcore_id, uint32_t rx_ring_length); + +/* exit signal handler */ +static void +signal_exit_now(int sigtype) +{ + unsigned lcore_id; + int ret; + + if (sigtype == SIGINT) { + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (rte_lcore_is_enabled(lcore_id) == 0) + continue; + + /* init power management library */ + ret = rte_power_exit(lcore_id); + if (ret) + rte_exit(EXIT_FAILURE, "Power management " + "library de-initialization failed on " + "core%u\n", lcore_id); + } + } + + rte_exit(EXIT_SUCCESS, "User forced exit\n"); +} + +/* Freqency scale down timer callback */ +static void +power_timer_cb(__attribute__((unused)) struct rte_timer *tim, + __attribute__((unused)) void *arg) +{ + uint64_t hz; + float sleep_time_ratio; + unsigned lcore_id = rte_lcore_id(); + + /* accumulate total execution time in us when callback is invoked */ + sleep_time_ratio = (float)(stats[lcore_id].sleep_time) / + (float)SCALING_PERIOD; + + /** + * check whether need to scale down frequency a step if it sleep a lot. + */ + if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) + rte_power_freq_down(lcore_id); + else if ( (unsigned)(stats[lcore_id].nb_rx_processed / + stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) + /** + * scale down a step if average packet per iteration less + * than expectation. + */ + rte_power_freq_down(lcore_id); + + /** + * initialize another timer according to current frequency to ensure + * timer interval is relatively fixed. + */ + hz = rte_get_timer_hz(); + rte_timer_reset(&power_timers[lcore_id], hz/TIMER_NUMBER_PER_SECOND, + SINGLE, lcore_id, power_timer_cb, NULL); + + stats[lcore_id].nb_rx_processed = 0; + stats[lcore_id].nb_iteration_looped = 0; + + stats[lcore_id].sleep_time = 0; +} + +/* Send burst of packets on an output interface */ +static inline int +send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) +{ + struct rte_mbuf **m_table; + int ret; + uint16_t queueid; + + queueid = qconf->tx_queue_id[port]; + m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; + + ret = rte_eth_tx_burst(port, queueid, m_table, n); + if (unlikely(ret < n)) { + do { + rte_pktmbuf_free(m_table[ret]); + } while (++ret < n); + } + + return 0; +} + +/* Enqueue a single packet, and send burst if queue is filled */ +static inline int +send_single_packet(struct rte_mbuf *m, uint8_t port) +{ + uint32_t lcore_id; + uint16_t len; + struct lcore_conf *qconf; + + lcore_id = rte_lcore_id(); + + qconf = &lcore_conf[lcore_id]; + len = qconf->tx_mbufs[port].len; + qconf->tx_mbufs[port].m_table[len] = m; + len++; + + /* enough pkts to be sent */ + if (unlikely(len == MAX_PKT_BURST)) { + send_burst(qconf, MAX_PKT_BURST, port); + len = 0; + } + + qconf->tx_mbufs[port].len = len; + return 0; +} + +#ifdef DO_RFC_1812_CHECKS +static inline int +is_valid_ipv4_pkt(struct ipv4_hdr *pkt, uint32_t link_len) +{ + /* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */ + /* + * 1. The packet length reported by the Link Layer must be large + * enough to hold the minimum length legal IP datagram (20 bytes). + */ + if (link_len < sizeof(struct ipv4_hdr)) + return -1; + + /* 2. The IP checksum must be correct. */ + /* this is checked in H/W */ + + /* + * 3. The IP version number must be 4. If the version number is not 4 + * then the packet may be another version of IP, such as IPng or + * ST-II. + */ + if (((pkt->version_ihl) >> 4) != 4) + return -3; + /* + * 4. The IP header length field must be large enough to hold the + * minimum length legal IP datagram (20 bytes = 5 words). + */ + if ((pkt->version_ihl & 0xf) < 5) + return -4; + + /* + * 5. The IP total length field must be large enough to hold the IP + * datagram header, whose length is specified in the IP header length + * field. + */ + if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct ipv4_hdr)) + return -5; + + return 0; +} +#endif + +#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) +static void +print_ipv4_key(struct ipv4_5tuple key) +{ + printf("IP dst = %08x, IP src = %08x, port dst = %d, port src = %d, " + "proto = %d\n", (unsigned)key.ip_dst, (unsigned)key.ip_src, + key.port_dst, key.port_src, key.proto); +} +static void +print_ipv6_key(struct ipv6_5tuple key) +{ + printf( "IP dst = " IPv6_BYTES_FMT ", IP src = " IPv6_BYTES_FMT ", " + "port dst = %d, port src = %d, proto = %d\n", + IPv6_BYTES(key.ip_dst), IPv6_BYTES(key.ip_src), + key.port_dst, key.port_src, key.proto); +} + +static inline uint8_t +get_ipv4_dst_port(struct ipv4_hdr *ipv4_hdr, uint8_t portid, + lookup_struct_t * ipv4_l3fwd_lookup_struct) +{ + struct ipv4_5tuple key; + struct tcp_hdr *tcp; + struct udp_hdr *udp; + int ret = 0; + + key.ip_dst = rte_be_to_cpu_32(ipv4_hdr->dst_addr); + key.ip_src = rte_be_to_cpu_32(ipv4_hdr->src_addr); + key.proto = ipv4_hdr->next_proto_id; + + switch (ipv4_hdr->next_proto_id) { + case IPPROTO_TCP: + tcp = (struct tcp_hdr *)((unsigned char *)ipv4_hdr + + sizeof(struct ipv4_hdr)); + key.port_dst = rte_be_to_cpu_16(tcp->dst_port); + key.port_src = rte_be_to_cpu_16(tcp->src_port); + break; + + case IPPROTO_UDP: + udp = (struct udp_hdr *)((unsigned char *)ipv4_hdr + + sizeof(struct ipv4_hdr)); + key.port_dst = rte_be_to_cpu_16(udp->dst_port); + key.port_src = rte_be_to_cpu_16(udp->src_port); + break; + + default: + key.port_dst = 0; + key.port_src = 0; + break; + } + + /* Find destination port */ + ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key); + return (uint8_t)((ret < 0)? portid : ipv4_l3fwd_out_if[ret]); +} + +static inline uint8_t +get_ipv6_dst_port(struct ipv6_hdr *ipv6_hdr, uint8_t portid, + lookup_struct_t *ipv6_l3fwd_lookup_struct) +{ + struct ipv6_5tuple key; + struct tcp_hdr *tcp; + struct udp_hdr *udp; + int ret = 0; + + memcpy(key.ip_dst, ipv6_hdr->dst_addr, IPV6_ADDR_LEN); + memcpy(key.ip_src, ipv6_hdr->src_addr, IPV6_ADDR_LEN); + + key.proto = ipv6_hdr->proto; + + switch (ipv6_hdr->proto) { + case IPPROTO_TCP: + tcp = (struct tcp_hdr *)((unsigned char *) ipv6_hdr + + sizeof(struct ipv6_hdr)); + key.port_dst = rte_be_to_cpu_16(tcp->dst_port); + key.port_src = rte_be_to_cpu_16(tcp->src_port); + break; + + case IPPROTO_UDP: + udp = (struct udp_hdr *)((unsigned char *) ipv6_hdr + + sizeof(struct ipv6_hdr)); + key.port_dst = rte_be_to_cpu_16(udp->dst_port); + key.port_src = rte_be_to_cpu_16(udp->src_port); + break; + + default: + key.port_dst = 0; + key.port_src = 0; + break; + } + + /* Find destination port */ + ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key); + return (uint8_t)((ret < 0)? portid : ipv6_l3fwd_out_if[ret]); +} +#endif + +#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) +static inline uint8_t +get_ipv4_dst_port(struct ipv4_hdr *ipv4_hdr, uint8_t portid, + lookup_struct_t *ipv4_l3fwd_lookup_struct) +{ + uint8_t next_hop; + + return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct, + rte_be_to_cpu_32(ipv4_hdr->dst_addr), &next_hop) == 0)? + next_hop : portid); +} +#endif + +static inline void +l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, + struct lcore_conf *qconf) +{ + struct ether_hdr *eth_hdr; + struct ipv4_hdr *ipv4_hdr; + void *d_addr_bytes; + uint8_t dst_port; + + eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + + if (m->ol_flags & PKT_RX_IPV4_HDR) { + /* Handle IPv4 headers.*/ + ipv4_hdr = + (struct ipv4_hdr *)(rte_pktmbuf_mtod(m, unsigned char*) + + sizeof(struct ether_hdr)); + +#ifdef DO_RFC_1812_CHECKS + /* Check to make sure the packet is valid (RFC1812) */ + if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt.pkt_len) < 0) { + rte_pktmbuf_free(m); + return; + } +#endif + + dst_port = get_ipv4_dst_port(ipv4_hdr, portid, + qconf->ipv4_lookup_struct); + if (dst_port >= RTE_MAX_ETHPORTS || + (enabled_port_mask & 1 << dst_port) == 0) + dst_port = portid; + + /* 02:00:00:00:00:xx */ + d_addr_bytes = ð_hdr->d_addr.addr_bytes[0]; + *((uint64_t *)d_addr_bytes) = + 0x000000000002 + ((uint64_t)dst_port << 40); + +#ifdef DO_RFC_1812_CHECKS + /* Update time to live and header checksum */ + --(ipv4_hdr->time_to_live); + ++(ipv4_hdr->hdr_checksum); +#endif + + /* src addr */ + ether_addr_copy(&ports_eth_addr[dst_port], ð_hdr->s_addr); + + send_single_packet(m, dst_port); + } + else { + /* Handle IPv6 headers.*/ +#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) + struct ipv6_hdr *ipv6_hdr; + + ipv6_hdr = + (struct ipv6_hdr *)(rte_pktmbuf_mtod(m, unsigned char*) + + sizeof(struct ether_hdr)); + + dst_port = get_ipv6_dst_port(ipv6_hdr, portid, + qconf->ipv6_lookup_struct); + + if (dst_port >= RTE_MAX_ETHPORTS || + (enabled_port_mask & 1 << dst_port) == 0) + dst_port = portid; + + /* 02:00:00:00:00:xx */ + d_addr_bytes = ð_hdr->d_addr.addr_bytes[0]; + *((uint64_t *)d_addr_bytes) = + 0x000000000002 + ((uint64_t)dst_port << 40); + + /* src addr */ + ether_addr_copy(&ports_eth_addr[dst_port], ð_hdr->s_addr); + + send_single_packet(m, dst_port); +#else + /* We don't currently handle IPv6 packets in LPM mode. */ + rte_pktmbuf_free(m); +#endif + } + +} + +#define SLEEP_GEAR1_THRESHOLD 100 +#define SLEEP_GEAR2_THRESHOLD 1000 + +static inline uint32_t +power_idle_heuristic(uint32_t zero_rx_packet_count) +{ + /* If zero count is less than 100, use it as the sleep time in us */ + if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD) + return zero_rx_packet_count; + /* If zero count is less than 1000, sleep time should be 100 us */ + else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) && + (zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD)) + return SLEEP_GEAR1_THRESHOLD; + /* If zero count is greater than 1000, sleep time should be 1000 us */ + else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD) + return SLEEP_GEAR2_THRESHOLD; + + return 0; +} + +static inline enum freq_scale_hint_t +power_freq_scaleup_heuristic(unsigned lcore_id, uint32_t rx_ring_length) +{ +/** + * HW Rx queue size is 128 by default, Rx burst read at maximum 32 entries + * per iteration + */ +#define FREQ_GEAR1_RX_PACKET_THRESHOLD MAX_PKT_BURST +#define FREQ_GEAR2_RX_PACKET_THRESHOLD 64 +#define FREQ_GEAR3_RX_PACKET_THRESHOLD 96 +#define FREQ_UP_TREND1_ACC 1 +#define FREQ_UP_TREND2_ACC 100 +#define FREQ_UP_THRESHOLD 10000 + + /** + * there are received packets to process, staying at C0 state while + * trying to scale up frequency depending on how many entries on h/w + * queue. Determine frequency scaleup trend based on availiable entries + * on Rx queues. + */ + if (rx_ring_length > FREQ_GEAR3_RX_PACKET_THRESHOLD) { + stats[lcore_id].trend = 0; + return FREQ_HIGHEST; + } else if (rx_ring_length > FREQ_GEAR2_RX_PACKET_THRESHOLD) + stats[lcore_id].trend += FREQ_UP_TREND2_ACC; + else if (rx_ring_length > FREQ_GEAR1_RX_PACKET_THRESHOLD) + stats[lcore_id].trend += FREQ_UP_TREND1_ACC; + + if (stats[lcore_id].trend > FREQ_UP_THRESHOLD) { + stats[lcore_id].trend = 0; + return FREQ_HIGHER; + } + + return FREQ_CURRENT; +} + +/* main processing loop */ +static int +main_loop(__attribute__((unused)) void *dummy) +{ + struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; + unsigned lcore_id; + uint64_t prev_tsc, diff_tsc, cur_tsc; + uint64_t prev_tsc_power = 0, cur_tsc_power, diff_tsc_power; + int i, j, nb_rx; + uint8_t portid, queueid; + struct lcore_conf *qconf; + struct lcore_rx_queue *rx_queue; + uint32_t rx_ring_length; + enum freq_scale_hint_t lcore_scaleup_hint; + + uint32_t lcore_rx_idle_count = 0; + uint32_t lcore_idle_hint = 0; + + const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; + + prev_tsc = 0; + + lcore_id = rte_lcore_id(); + qconf = &lcore_conf[lcore_id]; + + if (qconf->n_rx_queue == 0) { + RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", lcore_id); + return 0; + } + + RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id); + + for (i = 0; i < qconf->n_rx_queue; i++) { + + portid = qconf->rx_queue_list[i].port_id; + queueid = qconf->rx_queue_list[i].queue_id; + RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%hhu " + "rxqueueid=%hhu\n", lcore_id, portid, queueid); + } + + while (1) { + stats[lcore_id].nb_iteration_looped++; + + cur_tsc = rte_rdtsc(); + cur_tsc_power = cur_tsc; + + /* + * TX burst queue drain + */ + diff_tsc = cur_tsc - prev_tsc; + if (unlikely(diff_tsc > drain_tsc)) { + + /* + * This could be optimized (use queueid instead of + * portid), but it is not called so often + */ + for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++) { + if (qconf->tx_mbufs[portid].len == 0) + continue; + send_burst(&lcore_conf[lcore_id], + qconf->tx_mbufs[portid].len, + portid); + qconf->tx_mbufs[portid].len = 0; + } + + prev_tsc = cur_tsc; + } + + diff_tsc_power = cur_tsc_power - prev_tsc_power; + if (diff_tsc_power > TIMER_RESOLUTION_CYCLES) { + rte_timer_manage(); + prev_tsc_power = cur_tsc_power; + } + + /* + * Read packet from RX queues + */ + lcore_scaleup_hint = FREQ_CURRENT; + lcore_rx_idle_count = 0; + for (i = 0; i < qconf->n_rx_queue; ++i) { + rx_queue = &(qconf->rx_queue_list[i]); + rx_queue->idle_hint = 0; + portid = rx_queue->port_id; + queueid = rx_queue->queue_id; + + nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, + MAX_PKT_BURST); + stats[lcore_id].nb_rx_processed += nb_rx; + if (unlikely(nb_rx == 0)) { + /** + * no packet received from rx queue, try to + * sleep for a while forcing CPU enter deeper + * C states. + */ + rx_queue->zero_rx_packet_count++; + + if (rx_queue->zero_rx_packet_count <= + MIN_ZERO_POLL_COUNT) + continue; + + rx_queue->idle_hint = power_idle_heuristic(\ + rx_queue->zero_rx_packet_count); + lcore_rx_idle_count++; + } else { + /** + * get availiable descriptor number via MMIO read is costly, + * so only do it when recent poll returns maximum number. + */ + if (nb_rx >= MAX_PKT_BURST) + rx_ring_length = rte_eth_rx_queue_count(portid, queueid); + else + rx_ring_length = 0; + + rx_queue->zero_rx_packet_count = 0; + + /** + * do not scale up frequency immediately as + * user to kernel space communication is costly + * which might impact packet I/O for received + * packets. + */ + rx_queue->freq_up_hint = + power_freq_scaleup_heuristic(lcore_id, + rx_ring_length); + } + + /* Prefetch first packets */ + for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { + rte_prefetch0(rte_pktmbuf_mtod( + pkts_burst[j], void *)); + } + + /* Prefetch and forward already prefetched packets */ + for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ + j + PREFETCH_OFFSET], void *)); + l3fwd_simple_forward(pkts_burst[j], portid, + qconf); + } + + /* Forward remaining prefetched packets */ + for (; j < nb_rx; j++) { + l3fwd_simple_forward(pkts_burst[j], portid, + qconf); + } + } + + if (likely(lcore_rx_idle_count != qconf->n_rx_queue)) { + for (i = 1, lcore_scaleup_hint = + qconf->rx_queue_list[0].freq_up_hint; + i < qconf->n_rx_queue; ++i) { + rx_queue = &(qconf->rx_queue_list[i]); + if (rx_queue->freq_up_hint > + lcore_scaleup_hint) + lcore_scaleup_hint = + rx_queue->freq_up_hint; + } + + if (lcore_scaleup_hint == FREQ_HIGHEST) + rte_power_freq_max(lcore_id); + else if (lcore_scaleup_hint == FREQ_HIGHER) + rte_power_freq_up(lcore_id); + } else { + /** + * All Rx queues empty in recent consecutive polls, + * sleep in a conservative manner, meaning sleep as + * less as possible. + */ + for (i = 1, lcore_idle_hint = + qconf->rx_queue_list[0].idle_hint; + i < qconf->n_rx_queue; ++i) { + rx_queue = &(qconf->rx_queue_list[i]); + if (rx_queue->idle_hint < lcore_idle_hint) + lcore_idle_hint = rx_queue->idle_hint; + } + + if ( lcore_idle_hint < SLEEP_GEAR1_THRESHOLD) + /** + * execute "pause" instruction to avoid context + * switch for short sleep. + */ + rte_delay_us(lcore_idle_hint); + else + /* long sleep force runing thread to suspend */ + usleep(lcore_idle_hint); + + stats[lcore_id].sleep_time += lcore_idle_hint; + } + } +} + +static int +check_lcore_params(void) +{ + uint8_t queue, lcore; + uint16_t i; + int socketid; + + for (i = 0; i < nb_lcore_params; ++i) { + queue = lcore_params[i].queue_id; + if (queue >= MAX_RX_QUEUE_PER_PORT) { + printf("invalid queue number: %hhu\n", queue); + return -1; + } + lcore = lcore_params[i].lcore_id; + if (!rte_lcore_is_enabled(lcore)) { + printf("error: lcore %hhu is not enabled in lcore " + "mask\n", lcore); + return -1; + } + if ((socketid = rte_lcore_to_socket_id(lcore) != 0) && + (numa_on == 0)) { + printf("warning: lcore %hhu is on socket %d with numa " + "off\n", lcore, socketid); + } + } + return 0; +} + +static int +check_port_config(const unsigned nb_ports) +{ + unsigned portid; + uint16_t i; + + for (i = 0; i < nb_lcore_params; ++i) { + portid = lcore_params[i].port_id; + if ((enabled_port_mask & (1 << portid)) == 0) { + printf("port %u is not enabled in port mask\n", + portid); + return -1; + } + if (portid >= nb_ports) { + printf("port %u is not present on the board\n", + portid); + return -1; + } + } + return 0; +} + +static uint8_t +get_port_n_rx_queues(const uint8_t port) +{ + int queue = -1; + uint16_t i; + + for (i = 0; i < nb_lcore_params; ++i) { + if (lcore_params[i].port_id == port && + lcore_params[i].queue_id > queue) + queue = lcore_params[i].queue_id; + } + return (uint8_t)(++queue); +} + +static int +init_lcore_rx_queues(void) +{ + uint16_t i, nb_rx_queue; + uint8_t lcore; + + for (i = 0; i < nb_lcore_params; ++i) { + lcore = lcore_params[i].lcore_id; + nb_rx_queue = lcore_conf[lcore].n_rx_queue; + if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) { + printf("error: too many queues (%u) for lcore: %u\n", + (unsigned)nb_rx_queue + 1, (unsigned)lcore); + return -1; + } else { + lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id = + lcore_params[i].port_id; + lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id = + lcore_params[i].queue_id; + lcore_conf[lcore].n_rx_queue++; + } + } + return 0; +} + +/* display usage */ +static void +print_usage(const char *prgname) +{ + printf ("%s [EAL options] -- -p PORTMASK -P" + " [--config (port,queue,lcore)[,(port,queue,lcore]]" + " [--enable-jumbo [--max-pkt-len PKTLEN]]\n" + " -p PORTMASK: hexadecimal bitmask of ports to configure\n" + " -P : enable promiscuous mode\n" + " --config (port,queue,lcore): rx queues configuration\n" + " --no-numa: optional, disable numa awareness\n" + " --enable-jumbo: enable jumbo frame" + " which max packet len is PKTLEN in decimal (64-9600)\n", + prgname); +} + +static int parse_max_pkt_len(const char *pktlen) +{ + char *end = NULL; + unsigned long len; + + /* parse decimal string */ + len = strtoul(pktlen, &end, 10); + if ((pktlen[0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + + if (len == 0) + return -1; + + return len; +} + +static int +parse_portmask(const char *portmask) +{ + char *end = NULL; + unsigned long pm; + + /* parse hexadecimal string */ + pm = strtoul(portmask, &end, 16); + if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + + if (pm == 0) + return -1; + + return pm; +} + +static int +parse_config(const char *q_arg) +{ + char s[256]; + const char *p, *p0 = q_arg; + char *end; + enum fieldnames { + FLD_PORT = 0, + FLD_QUEUE, + FLD_LCORE, + _NUM_FLD + }; + unsigned long int_fld[_NUM_FLD]; + char *str_fld[_NUM_FLD]; + int i; + unsigned size; + + nb_lcore_params = 0; + + while ((p = strchr(p0,'(')) != NULL) { + ++p; + if((p0 = strchr(p,')')) == NULL) + return -1; + + size = p0 - p; + if(size >= sizeof(s)) + return -1; + + rte_snprintf(s, sizeof(s), "%.*s", size, p); + if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') != + _NUM_FLD) + return -1; + for (i = 0; i < _NUM_FLD; i++){ + errno = 0; + int_fld[i] = strtoul(str_fld[i], &end, 0); + if (errno != 0 || end == str_fld[i] || int_fld[i] > + 255) + return -1; + } + if (nb_lcore_params >= MAX_LCORE_PARAMS) { + printf("exceeded max number of lcore params: %hu\n", + nb_lcore_params); + return -1; + } + lcore_params_array[nb_lcore_params].port_id = + (uint8_t)int_fld[FLD_PORT]; + lcore_params_array[nb_lcore_params].queue_id = + (uint8_t)int_fld[FLD_QUEUE]; + lcore_params_array[nb_lcore_params].lcore_id = + (uint8_t)int_fld[FLD_LCORE]; + ++nb_lcore_params; + } + lcore_params = lcore_params_array; + + return 0; +} + +/* Parse the argument given in the command line of the application */ +static int +parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + static struct option lgopts[] = { + {"config", 1, 0, 0}, + {"no-numa", 0, 0, 0}, + {"enable-jumbo", 0, 0, 0}, + {NULL, 0, 0, 0} + }; + + argvopt = argv; + + while ((opt = getopt_long(argc, argvopt, "p:P", + lgopts, &option_index)) != EOF) { + + switch (opt) { + /* portmask */ + case 'p': + enabled_port_mask = parse_portmask(optarg); + if (enabled_port_mask == 0) { + printf("invalid portmask\n"); + print_usage(prgname); + return -1; + } + break; + case 'P': + printf("Promiscuous mode selected\n"); + promiscuous_on = 1; + break; + + /* long options */ + case 0: + if (!strncmp(lgopts[option_index].name, "config", 6)) { + ret = parse_config(optarg); + if (ret) { + printf("invalid config\n"); + print_usage(prgname); + return -1; + } + } + + if (!strncmp(lgopts[option_index].name, + "no-numa", 7)) { + printf("numa is disabled \n"); + numa_on = 0; + } + + if (!strncmp(lgopts[option_index].name, + "enable-jumbo", 12)) { + struct option lenopts = + {"max-pkt-len", required_argument, \ + 0, 0}; + + printf("jumbo frame is enabled \n"); + port_conf.rxmode.jumbo_frame = 1; + + /** + * if no max-pkt-len set, use the default value + * ETHER_MAX_LEN + */ + if (0 == getopt_long(argc, argvopt, "", + &lenopts, &option_index)) { + ret = parse_max_pkt_len(optarg); + if ((ret < 64) || + (ret > MAX_JUMBO_PKT_LEN)){ + printf("invalid packet " + "length\n"); + print_usage(prgname); + return -1; + } + port_conf.rxmode.max_rx_pkt_len = ret; + } + printf("set jumbo frame " + "max packet length to %u\n", + (unsigned int)port_conf.rxmode.max_rx_pkt_len); + } + + break; + + default: + print_usage(prgname); + return -1; + } + } + + if (optind >= 0) + argv[optind-1] = prgname; + + ret = optind-1; + optind = 0; /* reset getopt lib */ + return ret; +} + +static void +print_ethaddr(const char *name, const struct ether_addr *eth_addr) +{ + printf ("%s%02X:%02X:%02X:%02X:%02X:%02X", name, + eth_addr->addr_bytes[0], + eth_addr->addr_bytes[1], + eth_addr->addr_bytes[2], + eth_addr->addr_bytes[3], + eth_addr->addr_bytes[4], + eth_addr->addr_bytes[5]); +} + +#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) +static void +setup_hash(int socketid) +{ + struct rte_hash_parameters ipv4_l3fwd_hash_params = { + .name = NULL, + .entries = L3FWD_HASH_ENTRIES, + .bucket_entries = 4, + .key_len = sizeof(struct ipv4_5tuple), + .hash_func = DEFAULT_HASH_FUNC, + .hash_func_init_val = 0, + }; + + struct rte_hash_parameters ipv6_l3fwd_hash_params = { + .name = NULL, + .entries = L3FWD_HASH_ENTRIES, + .bucket_entries = 4, + .key_len = sizeof(struct ipv6_5tuple), + .hash_func = DEFAULT_HASH_FUNC, + .hash_func_init_val = 0, + }; + + unsigned i; + int ret; + char s[64]; + + /* create ipv4 hash */ + rte_snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid); + ipv4_l3fwd_hash_params.name = s; + ipv4_l3fwd_hash_params.socket_id = socketid; + ipv4_l3fwd_lookup_struct[socketid] = + rte_hash_create(&ipv4_l3fwd_hash_params); + if (ipv4_l3fwd_lookup_struct[socketid] == NULL) + rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on " + "socket %d\n", socketid); + + /* create ipv6 hash */ + rte_snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid); + ipv6_l3fwd_hash_params.name = s; + ipv6_l3fwd_hash_params.socket_id = socketid; + ipv6_l3fwd_lookup_struct[socketid] = + rte_hash_create(&ipv6_l3fwd_hash_params); + if (ipv6_l3fwd_lookup_struct[socketid] == NULL) + rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on " + "socket %d\n", socketid); + + + /* populate the ipv4 hash */ + for (i = 0; i < IPV4_L3FWD_NUM_ROUTES; i++) { + ret = rte_hash_add_key (ipv4_l3fwd_lookup_struct[socketid], + (void *) &ipv4_l3fwd_route_array[i].key); + if (ret < 0) { + rte_exit(EXIT_FAILURE, "Unable to add entry %u to the" + "l3fwd hash on socket %d\n", i, socketid); + } + ipv4_l3fwd_out_if[ret] = ipv4_l3fwd_route_array[i].if_out; + printf("Hash: Adding key\n"); + print_ipv4_key(ipv4_l3fwd_route_array[i].key); + } + + /* populate the ipv6 hash */ + for (i = 0; i < IPV6_L3FWD_NUM_ROUTES; i++) { + ret = rte_hash_add_key (ipv6_l3fwd_lookup_struct[socketid], + (void *) &ipv6_l3fwd_route_array[i].key); + if (ret < 0) { + rte_exit(EXIT_FAILURE, "Unable to add entry %u to the" + "l3fwd hash on socket %d\n", i, socketid); + } + ipv6_l3fwd_out_if[ret] = ipv6_l3fwd_route_array[i].if_out; + printf("Hash: Adding key\n"); + print_ipv6_key(ipv6_l3fwd_route_array[i].key); + } +} +#endif + +#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) +static void +setup_lpm(int socketid) +{ + unsigned i; + int ret; + char s[64]; + + /* create the LPM table */ + rte_snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid); + ipv4_l3fwd_lookup_struct[socketid] = rte_lpm_create(s, socketid, + IPV4_L3FWD_LPM_MAX_RULES, 0); + if (ipv4_l3fwd_lookup_struct[socketid] == NULL) + rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table" + " on socket %d\n", socketid); + + /* populate the LPM table */ + for (i = 0; i < IPV4_L3FWD_NUM_ROUTES; i++) { + ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid], + ipv4_l3fwd_route_array[i].ip, + ipv4_l3fwd_route_array[i].depth, + ipv4_l3fwd_route_array[i].if_out); + + if (ret < 0) { + rte_exit(EXIT_FAILURE, "Unable to add entry %u to the " + "l3fwd LPM table on socket %d\n", + i, socketid); + } + + printf("LPM: Adding route 0x%08x / %d (%d)\n", + (unsigned)ipv4_l3fwd_route_array[i].ip, + ipv4_l3fwd_route_array[i].depth, + ipv4_l3fwd_route_array[i].if_out); + } +} +#endif + +static int +init_mem(unsigned nb_mbuf) +{ + struct lcore_conf *qconf; + int socketid; + unsigned lcore_id; + char s[64]; + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (rte_lcore_is_enabled(lcore_id) == 0) + continue; + + if (numa_on) + socketid = rte_lcore_to_socket_id(lcore_id); + else + socketid = 0; + + if (socketid >= NB_SOCKETS) { + rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is " + "out of range %d\n", socketid, + lcore_id, NB_SOCKETS); + } + if (pktmbuf_pool[socketid] == NULL) { + rte_snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); + pktmbuf_pool[socketid] = + rte_mempool_create(s, nb_mbuf, + MBUF_SIZE, MEMPOOL_CACHE_SIZE, + sizeof(struct rte_pktmbuf_pool_private), + rte_pktmbuf_pool_init, NULL, + rte_pktmbuf_init, NULL, + socketid, 0); + if (pktmbuf_pool[socketid] == NULL) + rte_exit(EXIT_FAILURE, + "Cannot init mbuf pool on socket %d\n", + socketid); + else + printf("Allocated mbuf pool on socket %d\n", + socketid); + +#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) + setup_lpm(socketid); +#else + setup_hash(socketid); +#endif + } + qconf = &lcore_conf[lcore_id]; + qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid]; +#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) + qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid]; +#endif + } + return 0; +} + +/* Check the link status of all ports in up to 9s, and print them finally */ +static void +check_all_ports_link_status(uint8_t port_num, uint32_t port_mask) +{ +#define CHECK_INTERVAL 100 /* 100ms */ +#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ + uint8_t portid, count, all_ports_up, print_flag = 0; + struct rte_eth_link link; + + printf("\nChecking link status"); + fflush(stdout); + for (count = 0; count <= MAX_CHECK_TIME; count++) { + all_ports_up = 1; + for (portid = 0; portid < port_num; portid++) { + if ((port_mask & (1 << portid)) == 0) + continue; + memset(&link, 0, sizeof(link)); + rte_eth_link_get_nowait(portid, &link); + /* print link status if flag set */ + if (print_flag == 1) { + if (link.link_status) + printf("Port %d Link Up - speed %u " + "Mbps - %s\n", (uint8_t)portid, + (unsigned)link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? + ("full-duplex") : ("half-duplex\n")); + else + printf("Port %d Link Down\n", + (uint8_t)portid); + continue; + } + /* clear all_ports_up flag if any link down */ + if (link.link_status == 0) { + all_ports_up = 0; + break; + } + } + /* after finally printing all link status, get out */ + if (print_flag == 1) + break; + + if (all_ports_up == 0) { + printf("."); + fflush(stdout); + rte_delay_ms(CHECK_INTERVAL); + } + + /* set the print_flag if all ports up or timeout */ + if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { + print_flag = 1; + printf("done\n"); + } + } +} + +int +MAIN(int argc, char **argv) +{ + struct lcore_conf *qconf; + int ret; + unsigned nb_ports; + uint16_t queueid; + unsigned lcore_id; + uint64_t hz; + uint32_t n_tx_queue, nb_lcores; + uint8_t portid, nb_rx_queue, queue, socketid; + + /* catch SIGINT and restore cpufreq governor to ondemand */ + signal(SIGINT, signal_exit_now); + + /* init EAL */ + ret = rte_eal_init(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n"); + argc -= ret; + argv += ret; + + /* init RTE timer library to be used late */ + rte_timer_subsystem_init(); + + /* parse application arguments (after the EAL ones) */ + ret = parse_args(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n"); + + if (check_lcore_params() < 0) + rte_exit(EXIT_FAILURE, "check_lcore_params failed\n"); + + ret = init_lcore_rx_queues(); + if (ret < 0) + rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n"); + + + /* init driver(s) */ + if (rte_pmd_init_all() < 0) + rte_exit(EXIT_FAILURE, "Cannot init pmd\n"); + + if (rte_eal_pci_probe() < 0) + rte_exit(EXIT_FAILURE, "Cannot probe PCI\n"); + + nb_ports = rte_eth_dev_count(); + if (nb_ports > RTE_MAX_ETHPORTS) + nb_ports = RTE_MAX_ETHPORTS; + + if (check_port_config(nb_ports) < 0) + rte_exit(EXIT_FAILURE, "check_port_config failed\n"); + + nb_lcores = rte_lcore_count(); + + /* initialize all ports */ + for (portid = 0; portid < nb_ports; portid++) { + /* skip ports that are not enabled */ + if ((enabled_port_mask & (1 << portid)) == 0) { + printf("\nSkipping disabled port %d\n", portid); + continue; + } + + /* init port */ + printf("Initializing port %d ... ", portid ); + fflush(stdout); + + nb_rx_queue = get_port_n_rx_queues(portid); + n_tx_queue = nb_lcores; + if (n_tx_queue > MAX_TX_QUEUE_PER_PORT) + n_tx_queue = MAX_TX_QUEUE_PER_PORT; + printf("Creating queues: nb_rxq=%d nb_txq=%u... ", + nb_rx_queue, (unsigned)n_tx_queue ); + ret = rte_eth_dev_configure(portid, nb_rx_queue, + (uint16_t)n_tx_queue, &port_conf); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Cannot configure device: " + "err=%d, port=%d\n", ret, portid); + + rte_eth_macaddr_get(portid, &ports_eth_addr[portid]); + print_ethaddr(" Address:", &ports_eth_addr[portid]); + printf(", "); + + /* init memory */ + ret = init_mem(NB_MBUF); + if (ret < 0) + rte_exit(EXIT_FAILURE, "init_mem failed\n"); + + /* init one TX queue per couple (lcore,port) */ + queueid = 0; + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (rte_lcore_is_enabled(lcore_id) == 0) + continue; + + if (numa_on) + socketid = \ + (uint8_t)rte_lcore_to_socket_id(lcore_id); + else + socketid = 0; + + printf("txq=%u,%d,%d ", lcore_id, queueid, socketid); + fflush(stdout); + ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd, + socketid, &tx_conf); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_eth_tx_queue_setup: err=%d, " + "port=%d\n", ret, portid); + + qconf = &lcore_conf[lcore_id]; + qconf->tx_queue_id[portid] = queueid; + queueid++; + } + printf("\n"); + } + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (rte_lcore_is_enabled(lcore_id) == 0) + continue; + + /* init power management library */ + ret = rte_power_init(lcore_id); + if (ret) + rte_exit(EXIT_FAILURE, "Power management library " + "initialization failed on core%u\n", lcore_id); + + /* init timer structures for each enabled lcore */ + rte_timer_init(&power_timers[lcore_id]); + hz = rte_get_timer_hz(); + rte_timer_reset(&power_timers[lcore_id], + hz/TIMER_NUMBER_PER_SECOND, SINGLE, lcore_id, + power_timer_cb, NULL); + + qconf = &lcore_conf[lcore_id]; + printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); + fflush(stdout); + /* init RX queues */ + for(queue = 0; queue < qconf->n_rx_queue; ++queue) { + portid = qconf->rx_queue_list[queue].port_id; + queueid = qconf->rx_queue_list[queue].queue_id; + + if (numa_on) + socketid = \ + (uint8_t)rte_lcore_to_socket_id(lcore_id); + else + socketid = 0; + + printf("rxq=%d,%d,%d ", portid, queueid, socketid); + fflush(stdout); + + ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd, + socketid, &rx_conf, pktmbuf_pool[socketid]); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_eth_rx_queue_setup: err=%d, " + "port=%d\n", ret, portid); + } + } + + printf("\n"); + + /* start ports */ + for (portid = 0; portid < nb_ports; portid++) { + if ((enabled_port_mask & (1 << portid)) == 0) { + continue; + } + /* Start device */ + ret = rte_eth_dev_start(portid); + if (ret < 0) + rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, " + "port=%d\n", ret, portid); + + /* + * If enabled, put device in promiscuous mode. + * This allows IO forwarding mode to forward packets + * to itself through 2 cross-connected ports of the + * target machine. + */ + if (promiscuous_on) + rte_eth_promiscuous_enable(portid); + } + + check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask); + + /* launch per-lcore init on every lcore */ + rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER); + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (rte_eal_wait_lcore(lcore_id) < 0) + return -1; + } + + return 0; +} diff --git a/examples/l3fwd-power/main.h b/examples/l3fwd-power/main.h new file mode 100644 index 0000000000..d7ee4e4b93 --- /dev/null +++ b/examples/l3fwd-power/main.h @@ -0,0 +1,46 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2013 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _MAIN_H_ +#define _MAIN_H_ + +#ifdef RTE_EXEC_ENV_BAREMETAL +#define MAIN _main +#else +#define MAIN main +#endif + +int MAIN(int argc, char **argv); + +#endif /* _MAIN_H_ */ diff --git a/lib/Makefile b/lib/Makefile index c2594bd334..82ed571e97 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -46,6 +46,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += librte_pmd_ixgbe DIRS-$(CONFIG_RTE_LIBRTE_HASH) += librte_hash DIRS-$(CONFIG_RTE_LIBRTE_LPM) += librte_lpm DIRS-$(CONFIG_RTE_LIBRTE_NET) += librte_net +DIRS-$(CONFIG_RTE_LIBRTE_POWER) += librte_power DIRS-$(CONFIG_RTE_LIBRTE_PMAC) += librte_pmac ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h index 5f568c8177..5e886c09dc 100644 --- a/lib/librte_eal/common/include/rte_log.h +++ b/lib/librte_eal/common/include/rte_log.h @@ -72,6 +72,7 @@ extern struct rte_logs rte_logs; #define RTE_LOGTYPE_LPM 0x00000080 /**< Log related to LPM. */ #define RTE_LOGTYPE_KNI 0X00000100 /**< Log related to KNI. */ #define RTE_LOGTYPE_PMAC 0x00000200 /**< Log related to PMAC. */ +#define RTE_LOGTYPE_POWER 0x00000400 /**< Log related to power. */ /* these log types can be used in an application */ #define RTE_LOGTYPE_USER1 0x01000000 /**< User-defined log type 1. */ diff --git a/lib/librte_power/Makefile b/lib/librte_power/Makefile new file mode 100644 index 0000000000..6986e029cd --- /dev/null +++ b/lib/librte_power/Makefile @@ -0,0 +1,49 @@ +# BSD LICENSE +# +# Copyright(c) 2010-2013 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_power.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_POWER) := rte_power.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_POWER)-include := rte_power.h + +# this lib needs eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_POWER) += lib/librte_eal + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_power/rte_power.c b/lib/librte_power/rte_power.c new file mode 100644 index 0000000000..8370b96fcf --- /dev/null +++ b/lib/librte_power/rte_power.c @@ -0,0 +1,546 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2013 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "rte_power.h" + +#ifdef RTE_LIBRTE_POWER_DEBUG +#define POWER_DEBUG_TRACE(fmt, args...) do { \ + RTE_LOG(ERR, POWER, "%s: " fmt, __func__, ## args); \ + } while (0) +#else +#define POWER_DEBUG_TRACE(fmt, args...) +#endif + +#define FOPEN_OR_ERR_RET(f, retval) do { \ + if ((f) == NULL) { \ + RTE_LOG(ERR, POWER, "File not openned\n"); \ + return (retval); \ + } \ +} while(0) + +#define FOPS_OR_NULL_GOTO(ret, label) do { \ + if ((ret) == NULL) { \ + RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \ + goto label; \ + } \ +} while(0) + +#define FOPS_OR_ERR_GOTO(ret, label) do { \ + if ((ret) < 0) { \ + RTE_LOG(ERR, POWER, "File operations failed\n"); \ + goto label; \ + } \ +} while(0) + +#define STR_SIZE 1024 +#define POWER_CONVERT_TO_DECIMAL 10 + +#define POWER_GOVERNOR_USERSPACE "userspace" +#define POWER_SYSFILE_GOVERNOR \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor" +#define POWER_SYSFILE_AVAIL_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_available_frequencies" +#define POWER_SYSFILE_SETSPEED \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_setspeed" + +enum power_state { + POWER_IDLE = 0, + POWER_ONGOING, + POWER_USED, + POWER_UNKNOWN +}; + +/** + * Power info per lcore. + */ +struct rte_power_info { + unsigned lcore_id; /**< Logical core id */ + uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */ + uint32_t nb_freqs; /**< number of available freqs */ + FILE *f; /**< FD of scaling_setspeed */ + char governor_ori[32]; /**< Original governor name */ + uint32_t curr_idx; /**< Freq index in freqs array */ + volatile uint32_t state; /**< Power in use state */ +} __rte_cache_aligned; + +static struct rte_power_info lcore_power_info[RTE_MAX_LCORE]; + +/** + * It is to set specific freq for specific logical core, according to the index + * of supported frequencies. + */ +static int +set_freq_internal(struct rte_power_info *pi, uint32_t idx) +{ + if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) { + RTE_LOG(ERR, POWER, "Invalid frequency index %u, which " + "should be less than %u\n", idx, pi->nb_freqs); + return -1; + } + + /* Check if it is the same as current */ + if (idx == pi->curr_idx) + return 0; + + POWER_DEBUG_TRACE("Freqency[%u] %u to be set for lcore %u\n", + idx, pi->freqs[idx], pi->lcore_id); + if (fseek(pi->f, 0, SEEK_SET) < 0) { + RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 " + "for setting frequency for lcore %u\n", pi->lcore_id); + return -1; + } + if (fprintf(pi->f, "%u", pi->freqs[idx]) < 0) { + RTE_LOG(ERR, POWER, "Fail to write new frequency for " + "lcore %u\n", pi->lcore_id); + return -1; + } + fflush(pi->f); + pi->curr_idx = idx; + + return 1; +} + +/** + * It is to check the current scaling governor by reading sys file, and then + * set it into 'userspace' if it is not by writing the sys file. The original + * governor will be saved for rolling back. + */ +static int +power_set_governor_userspace(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *s; + int val; + + rte_snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Check if current governor is userspace */ + if (strncmp(buf, POWER_GOVERNOR_USERSPACE, + sizeof(POWER_GOVERNOR_USERSPACE)) == 0) { + ret = 0; + POWER_DEBUG_TRACE("Power management governor of lcore %u is " + "already userspace\n", pi->lcore_id); + goto out; + } + /* Save the original governor */ + rte_snprintf(pi->governor_ori, sizeof(pi->governor_ori), "%s", buf); + + /* Write 'userspace' to the governor */ + val = fseek(f, 0, SEEK_SET); + FOPS_OR_ERR_GOTO(val, out); + + val = fputs(POWER_GOVERNOR_USERSPACE, f); + FOPS_OR_ERR_GOTO(val, out); + + ret = 0; + RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been " + "set to user space successfully\n", pi->lcore_id); +out: + fclose(f); + + return ret; +} + +/** + * It is to get the available frequencies of the specific lcore by reading the + * sys file. + */ +static int +power_get_available_freqs(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1, i, count; + char *p; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *freqs[RTE_MAX_LCORE_FREQS]; + char *s; + + rte_snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_AVAIL_FREQ, + pi->lcore_id); + f = fopen(fullpath, "r"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Strip the line break if there is */ + p = strchr(buf, '\n'); + if (p != NULL) + *p = 0; + + /* Split string into at most RTE_MAX_LCORE_FREQS frequencies */ + count = rte_strsplit(buf, sizeof(buf), freqs, + RTE_MAX_LCORE_FREQS, ' '); + if (count <= 0) { + RTE_LOG(ERR, POWER, "No available frequency in " + ""POWER_SYSFILE_AVAIL_FREQ"\n", pi->lcore_id); + goto out; + } + if (count >= RTE_MAX_LCORE_FREQS) { + RTE_LOG(ERR, POWER, "Too many available frequencies : %d\n", + count); + goto out; + } + + /* Store the available frequncies into power context */ + for (i = 0, pi->nb_freqs = 0; i < count; i++) { + POWER_DEBUG_TRACE("Lcore %u frequency[%d]: %s\n", pi->lcore_id, + i, freqs[i]); + pi->freqs[pi->nb_freqs++] = strtoul(freqs[i], &p, + POWER_CONVERT_TO_DECIMAL); + } + + ret = 0; + POWER_DEBUG_TRACE("%d frequencie(s) of lcore %u are available\n", + count, pi->lcore_id); +out: + fclose(f); + + return ret; +} + +/** + * It is to fopen the sys file for the future setting the lcore frequency. + */ +static int +power_init_for_setting_freq(struct rte_power_info *pi) +{ + FILE *f; + char fullpath[PATH_MAX]; + char buf[BUFSIZ]; + uint32_t i, freq; + char *s; + + rte_snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_SETSPEED, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, -1); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + freq = strtoul(buf, NULL, POWER_CONVERT_TO_DECIMAL); + for (i = 0; i < pi->nb_freqs; i++) { + if (freq == pi->freqs[i]) { + pi->curr_idx = i; + pi->f = f; + return 0; + } + } + +out: + fclose(f); + + return -1; +} + +int +rte_power_init(unsigned lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n", + lcore_id, RTE_MAX_LCORE - 1U); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (rte_atomic32_cmpset(&(pi->state), POWER_IDLE, POWER_ONGOING) + == 0) { + RTE_LOG(INFO, POWER, "Power management of lcore %u is " + "in use\n", lcore_id); + return -1; + } + + pi->lcore_id = lcore_id; + /* Check and set the governor */ + if (power_set_governor_userspace(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to " + "userspace\n", lcore_id); + goto fail; + } + + /* Get the available frequencies */ + if (power_get_available_freqs(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot get available frequencies of " + "lcore %u\n", lcore_id); + goto fail; + } + + /* Init for setting lcore frequency */ + if (power_init_for_setting_freq(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot init for setting frequency for " + "lcore %u\n", lcore_id); + goto fail; + } + + /* Set freq to max by default */ + if (rte_power_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u " + "to max\n", lcore_id); + goto fail; + } + + RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u " + "power manamgement\n", lcore_id); + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_USED); + + return 0; + +fail: + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN); + + return -1; +} + +/** + * It is to check the governor and then set the original governor back if + * needed by writing the the sys file. + */ +static int +power_set_governor_original(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *s; + int val; + + rte_snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Check if the governor to be set is the same as current */ + if (strncmp(buf, pi->governor_ori, sizeof(pi->governor_ori)) == 0) { + ret = 0; + POWER_DEBUG_TRACE("Power management governor of lcore %u " + "has already been set to %s\n", + pi->lcore_id, pi->governor_ori); + goto out; + } + + /* Write back the original governor */ + val = fseek(f, 0, SEEK_SET); + FOPS_OR_ERR_GOTO(val, out); + + val = fputs(pi->governor_ori, f); + FOPS_OR_ERR_GOTO(val, out); + + ret = 0; + RTE_LOG(INFO, POWER, "Power manamgement governor of lcore %u " + "has been set back to %s successfully\n", + pi->lcore_id, pi->governor_ori); +out: + fclose(f); + + return ret; +} + +int +rte_power_exit(unsigned lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n", + lcore_id, RTE_MAX_LCORE - 1U); + return -1; + } + pi = &lcore_power_info[lcore_id]; + if (rte_atomic32_cmpset(&(pi->state), POWER_USED, POWER_ONGOING) + == 0) { + RTE_LOG(INFO, POWER, "Power management of lcore %u is " + "not used\n", lcore_id); + return -1; + } + + /* Close FD of setting freq */ + fclose(pi->f); + pi->f = NULL; + + /* Set the governor back to the original */ + if (power_set_governor_original(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot set the governor of %u back " + "to the original\n", lcore_id); + goto fail; + } + + RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from " + "'userspace' mode and been set back to the " + "original\n", lcore_id); + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_IDLE); + + return 0; + +fail: + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN); + + return -1; +} + +uint32_t +rte_power_freqs(unsigned lcore_id, uint32_t *freqs, uint32_t num) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE || !freqs) { + RTE_LOG(ERR, POWER, "Invalid input parameter\n"); + return 0; + } + + pi = &lcore_power_info[lcore_id]; + if (num < pi->nb_freqs) { + RTE_LOG(ERR, POWER, "Buffer size is not enough\n"); + return 0; + } + rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t)); + + return pi->nb_freqs; +} + +uint32_t +rte_power_get_freq(unsigned lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return RTE_POWER_INVALID_FREQ_INDEX; + } + + return lcore_power_info[lcore_id].curr_idx; +} + +int +rte_power_set_freq(unsigned lcore_id, uint32_t index) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + return set_freq_internal(&(lcore_power_info[lcore_id]), index); +} + +int +rte_power_freq_down(unsigned lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (pi->curr_idx + 1 == pi->nb_freqs) + return 0; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->curr_idx + 1); +} + +int +rte_power_freq_up(unsigned lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (pi->curr_idx == 0) + return 0; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->curr_idx - 1); +} + +int +rte_power_freq_max(unsigned lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(&lcore_power_info[lcore_id], 0); +} + +int +rte_power_freq_min(unsigned lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->nb_freqs - 1); +} + diff --git a/lib/librte_power/rte_power.h b/lib/librte_power/rte_power.h new file mode 100644 index 0000000000..16f9bc2658 --- /dev/null +++ b/lib/librte_power/rte_power.h @@ -0,0 +1,194 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2013 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _RTE_POWER_H +#define _RTE_POWER_H + +/** + * @file + * RTE Power Management + */ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_POWER_INVALID_FREQ_INDEX (~0) + +/** + * Initialize power management for a specific lcore. It will check and set the + * governor to userspace for the lcore, get the available frequencies, and + * prepare to set new lcore frequency. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_init(unsigned lcore_id); + +/** + * Exit power management on a specific lcore. It will set the governor to which + * is before initialized. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_exit(unsigned lcore_id); + +/** + * Get the available frequencies of a specific lcore. The return value will be + * the minimal one of the total number of available frequencies and the number + * of buffer. The index of available frequencies used in other interfaces + * should be in the range of 0 to this return value. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * @param freqs + * The buffer array to save the frequencies. + * @param num + * The number of frequencies to get. + * + * @return + * The number of available frequencies. + */ +uint32_t rte_power_freqs(unsigned lcore_id, uint32_t *freqs, uint32_t num); + +/** + * Return the current index of available frequencies of a specific lcore. It + * will return 'RTE_POWER_INVALID_FREQ_INDEX = (~0)' if error. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * The current index of available frequencies. + */ +uint32_t rte_power_get_freq(unsigned lcore_id); + +/** + * Set the new frequency for a specific lcore by indicating the index of + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * @param index + * The index of available frequencies. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency chnaged. + * - Negative on error. + */ +int rte_power_set_freq(unsigned lcore_id, uint32_t index); + +/** + * Scale up the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency chnaged. + * - Negative on error. + */ +int rte_power_freq_up(unsigned lcore_id); + +/** + * Scale down the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency chnaged. + * - Negative on error. + */ +int rte_power_freq_down(unsigned lcore_id); + +/** + * Scale up the frequency of a specific lcore to the highest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency chnaged. + * - Negative on error. + */ +int rte_power_freq_max(unsigned lcore_id); + +/** + * Scale down the frequency of a specific lcore to the lowest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency chnaged. + * - Negative on error. + */ +int rte_power_freq_min(unsigned lcore_id); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/mk/rte.app.mk b/mk/rte.app.mk index b192a92139..f8f265741d 100644 --- a/mk/rte.app.mk +++ b/mk/rte.app.mk @@ -93,6 +93,10 @@ ifeq ($(CONFIG_RTE_LIBRTE_LPM),y) LDLIBS += -lrte_lpm endif +ifeq ($(CONFIG_RTE_LIBRTE_POWER),y) +LDLIBS += -lrte_power +endif + ifeq ($(CONFIG_RTE_LIBRTE_PMAC),y) LDLIBS += -lrte_pmac endif