diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index db03cbf420..39a2b4cd23 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -33,20 +33,124 @@ enum pmd_mgmt_state {
 	PMD_MGMT_ENABLED
 };
 
-struct pmd_queue_cfg {
+union queue {
+	uint32_t val;
+	struct {
+		uint16_t portid;
+		uint16_t qid;
+	};
+};
+
+struct queue_list_entry {
+	TAILQ_ENTRY(queue_list_entry) next;
+	union queue queue;
+	uint64_t n_empty_polls;
+	uint64_t n_sleeps;
+	const struct rte_eth_rxtx_callback *cb;
+};
+
+struct pmd_core_cfg {
+	TAILQ_HEAD(queue_list_head, queue_list_entry) head;
+	/**< List of queues associated with this lcore */
+	size_t n_queues;
+	/**< How many queues are in the list? */
 	volatile enum pmd_mgmt_state pwr_mgmt_state;
 	/**< State of power management for this queue */
 	enum rte_power_pmd_mgmt_type cb_mode;
 	/**< Callback mode for this queue */
-	const struct rte_eth_rxtx_callback *cur_cb;
-	/**< Callback instance */
-	volatile bool umwait_in_progress;
-	/**< are we currently sleeping? */
-	uint64_t empty_poll_stats;
-	/**< Number of empty polls */
+	uint64_t n_queues_ready_to_sleep;
+	/**< Number of queues ready to enter power optimized state */
+	uint64_t sleep_target;
+	/**< Prevent a queue from triggering sleep multiple times */
 } __rte_cache_aligned;
+static struct pmd_core_cfg lcore_cfgs[RTE_MAX_LCORE];
+
+static inline bool
+queue_equal(const union queue *l, const union queue *r)
+{
+	return l->val == r->val;
+}
+
+static inline void
+queue_copy(union queue *dst, const union queue *src)
+{
+	dst->val = src->val;
+}
 
-static struct pmd_queue_cfg port_cfg[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
+static struct queue_list_entry *
+queue_list_find(const struct pmd_core_cfg *cfg, const union queue *q)
+{
+	struct queue_list_entry *cur;
+
+	TAILQ_FOREACH(cur, &cfg->head, next) {
+		if (queue_equal(&cur->queue, q))
+			return cur;
+	}
+	return NULL;
+}
+
+static int
+queue_list_add(struct pmd_core_cfg *cfg, const union queue *q)
+{
+	struct queue_list_entry *qle;
+
+	/* is it already in the list? */
+	if (queue_list_find(cfg, q) != NULL)
+		return -EEXIST;
+
+	qle = malloc(sizeof(*qle));
+	if (qle == NULL)
+		return -ENOMEM;
+	memset(qle, 0, sizeof(*qle));
+
+	queue_copy(&qle->queue, q);
+	TAILQ_INSERT_TAIL(&cfg->head, qle, next);
+	cfg->n_queues++;
+
+	return 0;
+}
+
+static struct queue_list_entry *
+queue_list_take(struct pmd_core_cfg *cfg, const union queue *q)
+{
+	struct queue_list_entry *found;
+
+	found = queue_list_find(cfg, q);
+	if (found == NULL)
+		return NULL;
+
+	TAILQ_REMOVE(&cfg->head, found, next);
+	cfg->n_queues--;
+
+	/* freeing is the responsibility of the caller */
+	return found;
+}
+
+static inline int
+get_monitor_addresses(struct pmd_core_cfg *cfg,
+		struct rte_power_monitor_cond *pmc, size_t len)
+{
+	const struct queue_list_entry *qle;
+	size_t i = 0;
+	int ret;
+
+	TAILQ_FOREACH(qle, &cfg->head, next) {
+		const union queue *q = &qle->queue;
+		struct rte_power_monitor_cond *cur;
+
+		/* attempted out of bounds access */
+		if (i >= len) {
+			RTE_LOG(ERR, POWER, "Too many queues being monitored\n");
+			return -1;
+		}
+
+		cur = &pmc[i++];
+		ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
 
 static void
 calc_tsc(void)
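
A note on the data structures introduced above: union queue packs the 16-bit port id and queue id into a single 32-bit value, so queue_equal() is one integer comparison and queue_copy() one store. The standalone snippet below only illustrates that behaviour; it is not part of the patch, and the name queue_key and the main() harness are hypothetical.

#include <assert.h>
#include <stdint.h>

union queue_key {	/* hypothetical stand-in for the patch's 'union queue' */
	uint32_t val;
	struct {
		uint16_t portid;
		uint16_t qid;
	};
};

int main(void)
{
	union queue_key a = {.portid = 1, .qid = 7};
	union queue_key b = {.portid = 1, .qid = 7};
	union queue_key c = {.portid = 2, .qid = 7};

	assert(a.val == b.val);	/* same (port, queue) -> same key */
	assert(a.val != c.val);	/* different port -> different key */
	return 0;
}
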
@@ -76,113 +180,307 @@ calc_tsc(void)
 	}
 }
 
+static inline void
+queue_reset(struct pmd_core_cfg *cfg, struct queue_list_entry *qcfg)
+{
+	const bool is_ready_to_sleep = qcfg->n_sleeps == cfg->sleep_target;
+
+	/* reset empty poll counter for this queue */
+	qcfg->n_empty_polls = 0;
+	/* reset the queue sleep counter as well */
+	qcfg->n_sleeps = 0;
+	/* remove the queue from list of queues ready to sleep */
+	if (is_ready_to_sleep)
+		cfg->n_queues_ready_to_sleep--;
+	/*
+	 * no need to change the lcore sleep target counter because this lcore
+	 * will reach the n_sleeps target anyway, and the other cores are
+	 * already counted, so there's no need to do anything else.
+	 */
+}
+
+static inline bool
+queue_can_sleep(struct pmd_core_cfg *cfg, struct queue_list_entry *qcfg)
+{
+	/* this function is called - that means we have an empty poll */
+	qcfg->n_empty_polls++;
+
+	/* if we haven't reached threshold for empty polls, we can't sleep */
+	if (qcfg->n_empty_polls <= EMPTYPOLL_MAX)
+		return false;
+
+	/*
+	 * we've reached a point where we are able to sleep, but we still need
+	 * to check if this queue has already been marked for sleeping.
+	 */
+	if (qcfg->n_sleeps == cfg->sleep_target)
+		return true;
+
+	/* mark this queue as ready for sleep */
+	qcfg->n_sleeps = cfg->sleep_target;
+	cfg->n_queues_ready_to_sleep++;
+
+	return true;
+}
+
+static inline bool
+lcore_can_sleep(struct pmd_core_cfg *cfg)
+{
+	/* are all queues ready to sleep? */
+	if (cfg->n_queues_ready_to_sleep != cfg->n_queues)
+		return false;
+
+	/* we've reached an iteration where we can sleep, reset sleep counter */
+	cfg->n_queues_ready_to_sleep = 0;
+	cfg->sleep_target++;
+	/*
+	 * we do not reset any individual queue empty poll counters, because
+	 * we want to keep sleeping on every poll until we actually get traffic.
+	 */
+
+	return true;
+}
+
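
How the three helpers above fit together: a queue may sleep only after more than EMPTYPOLL_MAX consecutive empty polls, and it is counted toward the current sleep round at most once by stamping n_sleeps with the lcore's sleep_target (which starts at 1 when the first queue is enabled, so an unmarked queue never matches the current round). The lcore sleeps only when every queue is stamped; lcore_can_sleep() then opens the next round. The following is a simplified standalone model of that bookkeeping, not the library code; the threshold value and the names q_can_sleep/core_can_sleep are assumptions for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define EMPTYPOLL_MAX 512	/* assumed; the real value lives in the library */

struct q { uint64_t n_empty_polls, n_sleeps; };
struct core { uint64_t ready, target; size_t n_queues; };

/* mirrors queue_can_sleep(): count empty polls, stamp queue once per round */
static bool q_can_sleep(struct core *c, struct q *q)
{
	if (++q->n_empty_polls <= EMPTYPOLL_MAX)
		return false;
	if (q->n_sleeps == c->target)
		return true;		/* already counted for this round */
	q->n_sleeps = c->target;	/* stamp against the current round */
	c->ready++;
	return true;
}

/* mirrors lcore_can_sleep(): sleep only when every queue is stamped */
static bool core_can_sleep(struct core *c)
{
	if (c->ready != c->n_queues)
		return false;
	c->ready = 0;	/* start counting the next round */
	c->target++;
	return true;
}
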
+ */ + + return true; +} + static uint16_t -clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, - uint16_t nb_rx, uint16_t max_pkts __rte_unused, - void *addr __rte_unused) +clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *arg) { + const unsigned int lcore = rte_lcore_id(); + struct queue_list_entry *queue_conf = arg; + struct pmd_core_cfg *lcore_conf; + const bool empty = nb_rx == 0; + + lcore_conf = &lcore_cfgs[lcore]; + + /* early exit */ + if (likely(!empty)) + /* early exit */ + queue_reset(lcore_conf, queue_conf); + else { + struct rte_power_monitor_cond pmc[lcore_conf->n_queues]; + int ret; + + /* can this queue sleep? */ + if (!queue_can_sleep(lcore_conf, queue_conf)) + return nb_rx; + + /* can this lcore sleep? */ + if (!lcore_can_sleep(lcore_conf)) + return nb_rx; + + /* gather all monitoring conditions */ + ret = get_monitor_addresses(lcore_conf, pmc, + lcore_conf->n_queues); + if (ret < 0) + return nb_rx; + + rte_power_monitor_multi(pmc, lcore_conf->n_queues, UINT64_MAX); + } - struct pmd_queue_cfg *q_conf; + return nb_rx; +} - q_conf = &port_cfg[port_id][qidx]; +static uint16_t +clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, + uint16_t nb_rx, uint16_t max_pkts __rte_unused, void *arg) +{ + struct queue_list_entry *queue_conf = arg; + /* this callback can't do more than one queue, omit multiqueue logic */ if (unlikely(nb_rx == 0)) { - q_conf->empty_poll_stats++; - if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { + queue_conf->n_empty_polls++; + if (unlikely(queue_conf->n_empty_polls > EMPTYPOLL_MAX)) { struct rte_power_monitor_cond pmc; - uint16_t ret; - - /* - * we might get a cancellation request while being - * inside the callback, in which case the wakeup - * wouldn't work because it would've arrived too early. - * - * to get around this, we notify the other thread that - * we're sleeping, so that it can spin until we're done. - * unsolicited wakeups are perfectly safe. 
- */ - q_conf->umwait_in_progress = true; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); - - /* check if we need to cancel sleep */ - if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) { - /* use monitoring condition to sleep */ - ret = rte_eth_get_monitor_addr(port_id, qidx, - &pmc); - if (ret == 0) - rte_power_monitor(&pmc, UINT64_MAX); - } - q_conf->umwait_in_progress = false; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); + int ret; + + /* use monitoring condition to sleep */ + ret = rte_eth_get_monitor_addr(port_id, qidx, + &pmc); + if (ret == 0) + rte_power_monitor(&pmc, UINT64_MAX); } } else - q_conf->empty_poll_stats = 0; + queue_conf->n_empty_polls = 0; return nb_rx; } static uint16_t -clb_pause(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, - uint16_t nb_rx, uint16_t max_pkts __rte_unused, - void *addr __rte_unused) +clb_pause(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *arg) { - struct pmd_queue_cfg *q_conf; - - q_conf = &port_cfg[port_id][qidx]; - - if (unlikely(nb_rx == 0)) { - q_conf->empty_poll_stats++; - /* sleep for 1 microsecond */ - if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { - /* use tpause if we have it */ - if (global_data.intrinsics_support.power_pause) { - const uint64_t cur = rte_rdtsc(); - const uint64_t wait_tsc = - cur + global_data.tsc_per_us; - rte_power_pause(wait_tsc); - } else { - uint64_t i; - for (i = 0; i < global_data.pause_per_us; i++) - rte_pause(); - } + const unsigned int lcore = rte_lcore_id(); + struct queue_list_entry *queue_conf = arg; + struct pmd_core_cfg *lcore_conf; + const bool empty = nb_rx == 0; + + lcore_conf = &lcore_cfgs[lcore]; + + if (likely(!empty)) + /* early exit */ + queue_reset(lcore_conf, queue_conf); + else { + /* can this queue sleep? */ + if (!queue_can_sleep(lcore_conf, queue_conf)) + return nb_rx; + + /* can this lcore sleep? */ + if (!lcore_can_sleep(lcore_conf)) + return nb_rx; + + /* sleep for 1 microsecond, use tpause if we have it */ + if (global_data.intrinsics_support.power_pause) { + const uint64_t cur = rte_rdtsc(); + const uint64_t wait_tsc = + cur + global_data.tsc_per_us; + rte_power_pause(wait_tsc); + } else { + uint64_t i; + for (i = 0; i < global_data.pause_per_us; i++) + rte_pause(); } - } else - q_conf->empty_poll_stats = 0; + } return nb_rx; } static uint16_t -clb_scale_freq(uint16_t port_id, uint16_t qidx, +clb_scale_freq(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused, struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, - uint16_t max_pkts __rte_unused, void *_ __rte_unused) + uint16_t max_pkts __rte_unused, void *arg) { - struct pmd_queue_cfg *q_conf; + const unsigned int lcore = rte_lcore_id(); + const bool empty = nb_rx == 0; + struct pmd_core_cfg *lcore_conf = &lcore_cfgs[lcore]; + struct queue_list_entry *queue_conf = arg; - q_conf = &port_cfg[port_id][qidx]; + if (likely(!empty)) { + /* early exit */ + queue_reset(lcore_conf, queue_conf); - if (unlikely(nb_rx == 0)) { - q_conf->empty_poll_stats++; - if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) - /* scale down freq */ - rte_power_freq_min(rte_lcore_id()); - } else { - q_conf->empty_poll_stats = 0; - /* scale up freq */ + /* scale up freq immediately */ rte_power_freq_max(rte_lcore_id()); + } else { + /* can this queue sleep? */ + if (!queue_can_sleep(lcore_conf, queue_conf)) + return nb_rx; + + /* can this lcore sleep? 
 static uint16_t
-clb_scale_freq(uint16_t port_id, uint16_t qidx,
+clb_scale_freq(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
 		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
-		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
+		uint16_t max_pkts __rte_unused, void *arg)
 {
-	struct pmd_queue_cfg *q_conf;
+	const unsigned int lcore = rte_lcore_id();
+	const bool empty = nb_rx == 0;
+	struct pmd_core_cfg *lcore_conf = &lcore_cfgs[lcore];
+	struct queue_list_entry *queue_conf = arg;
 
-	q_conf = &port_cfg[port_id][qidx];
+	if (likely(!empty)) {
+		/* early exit */
+		queue_reset(lcore_conf, queue_conf);
 
-	if (unlikely(nb_rx == 0)) {
-		q_conf->empty_poll_stats++;
-		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX))
-			/* scale down freq */
-			rte_power_freq_min(rte_lcore_id());
-	} else {
-		q_conf->empty_poll_stats = 0;
-		/* scale up freq */
+		/* scale up freq immediately */
 		rte_power_freq_max(rte_lcore_id());
+	} else {
+		/* can this queue sleep? */
+		if (!queue_can_sleep(lcore_conf, queue_conf))
+			return nb_rx;
+
+		/* can this lcore sleep? */
+		if (!lcore_can_sleep(lcore_conf))
+			return nb_rx;
+
+		rte_power_freq_min(rte_lcore_id());
 	}
 
 	return nb_rx;
 }
 
+static int
+queue_stopped(const uint16_t port_id, const uint16_t queue_id)
+{
+	struct rte_eth_rxq_info qinfo;
+
+	int ret = rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo);
+	if (ret < 0) {
+		if (ret == -ENOTSUP)
+			return 1;
+		else
+			return -1;
+	}
+
+	return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED;
+}
+
+static int
+cfg_queues_stopped(struct pmd_core_cfg *queue_cfg)
+{
+	const struct queue_list_entry *entry;
+
+	TAILQ_FOREACH(entry, &queue_cfg->head, next) {
+		const union queue *q = &entry->queue;
+		int ret = queue_stopped(q->portid, q->qid);
+		if (ret != 1)
+			return ret;
+	}
+	return 1;
+}
+
+static int
+check_scale(unsigned int lcore)
+{
+	enum power_management_env env;
+
+	/* only PSTATE and ACPI modes are supported */
+	if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
+			!rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ)) {
+		RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
+		return -ENOTSUP;
+	}
+	/* ensure we could initialize the power library */
+	if (rte_power_init(lcore))
+		return -EINVAL;
+
+	/* ensure we initialized the correct env */
+	env = rte_power_get_env();
+	if (env != PM_ENV_ACPI_CPUFREQ && env != PM_ENV_PSTATE_CPUFREQ) {
+		RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
+		return -ENOTSUP;
+	}
+
+	/* we're done */
+	return 0;
+}
+
+static int
+check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
+{
+	struct rte_power_monitor_cond dummy;
+	bool multimonitor_supported;
+
+	/* check if rte_power_monitor is supported */
+	if (!global_data.intrinsics_support.power_monitor) {
+		RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
+		return -ENOTSUP;
+	}
+	/* check if multi-monitor is supported */
+	multimonitor_supported =
+			global_data.intrinsics_support.power_monitor_multi;
+
+	/* if we're adding a new queue, do we support multiple queues? */
+	if (cfg->n_queues > 0 && !multimonitor_supported) {
+		RTE_LOG(DEBUG, POWER, "Monitoring multiple queues is not supported\n");
+		return -ENOTSUP;
+	}
+
+	/* check if the device supports the necessary PMD API */
+	if (rte_eth_get_monitor_addr(qdata->portid, qdata->qid,
+			&dummy) == -ENOTSUP) {
+		RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
+		return -ENOTSUP;
+	}
+
+	/* we're done */
+	return 0;
+}
+
+static inline rte_rx_callback_fn
+get_monitor_callback(void)
+{
+	return global_data.intrinsics_support.power_monitor_multi ?
+		clb_multiwait : clb_umwait;
+}
+
 int
 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 		uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
 {
-	struct pmd_queue_cfg *queue_cfg;
+	const union queue qdata = {.portid = port_id, .qid = queue_id};
+	struct pmd_core_cfg *lcore_cfg;
+	struct queue_list_entry *queue_cfg;
 	struct rte_eth_dev_info info;
+	rte_rx_callback_fn clb;
 	int ret;
 
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
@@ -203,9 +501,27 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 		goto end;
 	}
 
-	queue_cfg = &port_cfg[port_id][queue_id];
+	/* check if the queue is stopped */
+	ret = queue_stopped(port_id, queue_id);
+	if (ret != 1) {
+		/* error means invalid queue, 0 means queue wasn't stopped */
+		ret = ret < 0 ? -EINVAL : -EBUSY;
+		goto end;
+	}
+
+	lcore_cfg = &lcore_cfgs[lcore_id];
 
-	if (queue_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED) {
+	/* check if other queues are stopped as well */
+	ret = cfg_queues_stopped(lcore_cfg);
+	if (ret != 1) {
+		/* error means invalid queue, 0 means queue wasn't stopped */
+		ret = ret < 0 ? -EINVAL : -EBUSY;
+		goto end;
+	}
+
+	/* if callback was already enabled, check current callback type */
+	if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED &&
+			lcore_cfg->cb_mode != mode) {
 		ret = -EINVAL;
 		goto end;
 	}
@@ -215,89 +531,58 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 
 	switch (mode) {
 	case RTE_POWER_MGMT_TYPE_MONITOR:
-	{
-		struct rte_power_monitor_cond dummy;
-
-		/* check if rte_power_monitor is supported */
-		if (!global_data.intrinsics_support.power_monitor) {
-			RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
-			ret = -ENOTSUP;
-			goto end;
-		}
-
-		/* check if the device supports the necessary PMD API */
-		if (rte_eth_get_monitor_addr(port_id, queue_id,
-				&dummy) == -ENOTSUP) {
-			RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
-			ret = -ENOTSUP;
+		/* check if we can add a new queue */
+		ret = check_monitor(lcore_cfg, &qdata);
+		if (ret < 0)
 			goto end;
-		}
-		/* initialize data before enabling the callback */
-		queue_cfg->empty_poll_stats = 0;
-		queue_cfg->cb_mode = mode;
-		queue_cfg->umwait_in_progress = false;
-		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
-
-		/* ensure we update our state before callback starts */
-		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
 
-		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
-				clb_umwait, NULL);
+		clb = get_monitor_callback();
 		break;
-	}
 	case RTE_POWER_MGMT_TYPE_SCALE:
-	{
-		enum power_management_env env;
-		/* only PSTATE and ACPI modes are supported */
-		if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
-				!rte_power_check_env_supported(
-					PM_ENV_PSTATE_CPUFREQ)) {
-			RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
-			ret = -ENOTSUP;
-			goto end;
-		}
-		/* ensure we could initialize the power library */
-		if (rte_power_init(lcore_id)) {
-			ret = -EINVAL;
+		clb = clb_scale_freq;
+
+		/* we only have to check this when enabling first queue */
+		if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED)
+			break;
+		/* check if we can use frequency scaling on this lcore */
+		ret = check_scale(lcore_id);
+		if (ret < 0)
 			goto end;
-		}
-		/* ensure we initialized the correct env */
-		env = rte_power_get_env();
-		if (env != PM_ENV_ACPI_CPUFREQ &&
-				env != PM_ENV_PSTATE_CPUFREQ) {
-			RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
-			ret = -ENOTSUP;
-			goto end;
-		}
-		/* initialize data before enabling the callback */
-		queue_cfg->empty_poll_stats = 0;
-		queue_cfg->cb_mode = mode;
-		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
-
-		/* this is not necessary here, but do it anyway */
-		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
-				queue_id, clb_scale_freq, NULL);
 		break;
-	}
 	case RTE_POWER_MGMT_TYPE_PAUSE:
 		/* figure out various time-to-tsc conversions */
 		if (global_data.tsc_per_us == 0)
 			calc_tsc();
 
-		/* initialize data before enabling the callback */
-		queue_cfg->empty_poll_stats = 0;
-		queue_cfg->cb_mode = mode;
-		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
+		clb = clb_pause;
+		break;
+	default:
+		RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
+		ret = -EINVAL;
+		goto end;
+	}
+	/* add this queue to the list */
+	ret = queue_list_add(lcore_cfg, &qdata);
+	if (ret < 0) {
+		RTE_LOG(DEBUG, POWER, "Failed to add queue to list: %s\n",
+				strerror(-ret));
+		goto end;
+	}
+	/* new queue is always added last */
+	queue_cfg = TAILQ_LAST(&lcore_cfg->head, queue_list_head);
 
-		/* this is not necessary here, but do it anyway */
-		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+	/* when enabling first queue, ensure sleep target is not 0 */
+	if (lcore_cfg->n_queues == 1 && lcore_cfg->sleep_target == 0)
+		lcore_cfg->sleep_target = 1;
 
-		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
-				clb_pause, NULL);
-		break;
+	/* initialize data before enabling the callback */
+	if (lcore_cfg->n_queues == 1) {
+		lcore_cfg->cb_mode = mode;
+		lcore_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
 	}
+	queue_cfg->cb = rte_eth_add_rx_callback(port_id, queue_id,
+			clb, queue_cfg);
 
 	ret = 0;
 end:
 	return ret;
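
One design choice worth noting in the enable path above: queue_list_add() reports an errno-style code, so the function has to re-fetch the newly inserted entry with TAILQ_LAST(). An alternative shape, sketched below and not what the patch does, is to return the entry directly and fold the lookup away; queue_list_add_entry is a hypothetical name, and the sketch leans on the patch's own types, helpers and includes.

/* Sketch of an alternative queue_list_add() that returns the new entry
 * (NULL on duplicate or allocation failure) instead of an errno value. */
static struct queue_list_entry *
queue_list_add_entry(struct pmd_core_cfg *cfg, const union queue *q)
{
	struct queue_list_entry *qle;

	if (queue_list_find(cfg, q) != NULL)
		return NULL;	/* already tracked */

	qle = calloc(1, sizeof(*qle));	/* zero-filled, like malloc+memset */
	if (qle == NULL)
		return NULL;

	queue_copy(&qle->queue, q);
	TAILQ_INSERT_TAIL(&cfg->head, qle, next);
	cfg->n_queues++;

	return qle;	/* caller uses this directly; no TAILQ_LAST() needed */
}
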
@@ -307,59 +592,81 @@ int
 rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
 		uint16_t port_id, uint16_t queue_id)
 {
-	struct pmd_queue_cfg *queue_cfg;
+	const union queue qdata = {.portid = port_id, .qid = queue_id};
+	struct pmd_core_cfg *lcore_cfg;
+	struct queue_list_entry *queue_cfg;
+	int ret;
 
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
 
 	if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
 		return -EINVAL;
 
+	/* check if the queue is stopped */
+	ret = queue_stopped(port_id, queue_id);
+	if (ret != 1) {
+		/* error means invalid queue, 0 means queue wasn't stopped */
+		return ret < 0 ? -EINVAL : -EBUSY;
+	}
+
 	/* no need to check queue id as wrong queue id would not be enabled */
-	queue_cfg = &port_cfg[port_id][queue_id];
+	lcore_cfg = &lcore_cfgs[lcore_id];
+
+	/* check if other queues are stopped as well */
+	ret = cfg_queues_stopped(lcore_cfg);
+	if (ret != 1) {
+		/* error means invalid queue, 0 means queue wasn't stopped */
+		return ret < 0 ? -EINVAL : -EBUSY;
+	}
 
-	if (queue_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
+	if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
 		return -EINVAL;
 
-	/* stop any callbacks from progressing */
-	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
+	/*
+	 * There is no good/easy way to do this without race conditions, so we
+	 * are just going to throw our hands in the air and hope that the user
+	 * has read the documentation and has ensured that ports are stopped at
+	 * the time we enter the API functions.
+	 */
+	queue_cfg = queue_list_take(lcore_cfg, &qdata);
+	if (queue_cfg == NULL)
+		return -ENOENT;
 
-	/* ensure we update our state before continuing */
-	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+	/* if we've removed all queues from the lists, set state to disabled */
+	if (lcore_cfg->n_queues == 0)
+		lcore_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
 
-	switch (queue_cfg->cb_mode) {
-	case RTE_POWER_MGMT_TYPE_MONITOR:
-	{
-		bool exit = false;
-		do {
-			/*
-			 * we may request cancellation while the other thread
-			 * has just entered the callback but hasn't started
-			 * sleeping yet, so keep waking it up until we know it's
-			 * done sleeping.
-			 */
-			if (queue_cfg->umwait_in_progress)
-				rte_power_monitor_wakeup(lcore_id);
-			else
-				exit = true;
-		} while (!exit);
-	}
-	/* fall-through */
+	switch (lcore_cfg->cb_mode) {
+	case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */
 	case RTE_POWER_MGMT_TYPE_PAUSE:
-		rte_eth_remove_rx_callback(port_id, queue_id,
-				queue_cfg->cur_cb);
+		rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb);
 		break;
 	case RTE_POWER_MGMT_TYPE_SCALE:
-		rte_power_freq_max(lcore_id);
-		rte_eth_remove_rx_callback(port_id, queue_id,
-				queue_cfg->cur_cb);
-		rte_power_exit(lcore_id);
+		rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb);
+		/* disable power library on this lcore if this was last queue */
+		if (lcore_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED) {
+			rte_power_freq_max(lcore_id);
+			rte_power_exit(lcore_id);
+		}
 		break;
 	}
 	/*
-	 * we don't free the RX callback here because it is unsafe to do so
-	 * unless we know for a fact that all data plane threads have stopped.
+	 * the API doc mandates that the user stops all processing on affected
+	 * ports before calling any of these APIs, so we can assume that the
+	 * callbacks can be freed. we're intentionally casting away const-ness.
 	 */
-	queue_cfg->cur_cb = NULL;
+	rte_free((void *)queue_cfg->cb);
+	free(queue_cfg);
 
 	return 0;
 }
- */ - if (queue_cfg->umwait_in_progress) - rte_power_monitor_wakeup(lcore_id); - else - exit = true; - } while (!exit); - } - /* fall-through */ + switch (lcore_cfg->cb_mode) { + case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */ case RTE_POWER_MGMT_TYPE_PAUSE: - rte_eth_remove_rx_callback(port_id, queue_id, - queue_cfg->cur_cb); + rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb); break; case RTE_POWER_MGMT_TYPE_SCALE: - rte_power_freq_max(lcore_id); - rte_eth_remove_rx_callback(port_id, queue_id, - queue_cfg->cur_cb); - rte_power_exit(lcore_id); + rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb); + /* disable power library on this lcore if this was last queue */ + if (lcore_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED) { + rte_power_freq_max(lcore_id); + rte_power_exit(lcore_id); + } break; } /* - * we don't free the RX callback here because it is unsafe to do so - * unless we know for a fact that all data plane threads have stopped. + * the API doc mandates that the user stops all processing on affected + * ports before calling any of these API's, so we can assume that the + * callbacks can be freed. we're intentionally casting away const-ness. */ - queue_cfg->cur_cb = NULL; + rte_free((void *)queue_cfg->cb); + free(queue_cfg); return 0; } + +RTE_INIT(rte_power_ethdev_pmgmt_init) { + size_t i; + + /* initialize all tailqs */ + for (i = 0; i < RTE_DIM(lcore_cfgs); i++) { + struct pmd_core_cfg *cfg = &lcore_cfgs[i]; + TAILQ_INIT(&cfg->head); + } +}