mirror of https://github.com/OpenIPC/firmware.git
diff -drupN a/kernel/sched/fair.c b/kernel/sched/fair.c
|
|
--- a/kernel/sched/fair.c 2018-08-06 17:23:04.000000000 +0300
|
|
+++ b/kernel/sched/fair.c 2022-06-12 05:28:14.000000000 +0300
|
|
@@ -30,10 +30,13 @@
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/migrate.h>
|
|
#include <linux/task_work.h>
|
|
+#include <linux/module.h>
|
|
|
|
#include <trace/events/sched.h>
|
|
|
|
#include "sched.h"
|
|
+#include "tune.h"
|
|
+#include "walt.h"
|
|
|
|
/*
|
|
* Targeted preemption latency for CPU-bound tasks:
|
|
@@ -50,6 +53,15 @@
|
|
unsigned int sysctl_sched_latency = 6000000ULL;
|
|
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
|
|
|
|
+unsigned int sysctl_sched_sync_hint_enable = 1;
|
|
+unsigned int sysctl_sched_cstate_aware = 1;
|
|
+
|
|
+#ifdef CONFIG_SCHED_WALT
|
|
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
|
|
+unsigned int sysctl_sched_use_walt_task_util = 1;
|
|
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
|
|
+ (10 * NSEC_PER_MSEC);
|
|
+#endif
|
|
/*
|
|
* The initial- and re-scaling of tunables is configurable
|
|
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
|
|
@@ -116,7 +128,7 @@ unsigned int sysctl_sched_cfs_bandwidth_
|
|
|
|
/*
|
|
* The margin used when comparing utilization with CPU capacity:
|
|
- * util * 1024 < capacity * margin
|
|
+ * util * margin < capacity * 1024
|
|
*/
|
|
unsigned int capacity_margin = 1280; /* ~20% */
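The corrected comparison above reads: a utilization level fits a CPU when util * capacity_margin stays below capacity * 1024, i.e. roughly 20% headroom is required with capacity_margin = 1280. A minimal standalone sketch of that test, illustrative only and not part of the patch; the helper name fits_capacity and the sample numbers are assumptions:

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

static unsigned long capacity_margin = 1280;    /* ~20% headroom */

/* util fits on a CPU if it leaves ~20% of the capacity free */
static bool fits_capacity(unsigned long util, unsigned long capacity)
{
        return util * capacity_margin < capacity * SCHED_CAPACITY_SCALE;
}

int main(void)
{
        printf("%d\n", fits_capacity(700, 1024));      /* 1: 896000 < 1048576 */
        printf("%d\n", fits_capacity(850, 1024));      /* 0: 1088000 >= 1048576 */
        return 0;
}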
|
|
|
|
@@ -290,19 +302,59 @@ static inline struct cfs_rq *group_cfs_r
|
|
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
|
{
|
|
if (!cfs_rq->on_list) {
|
|
+ struct rq *rq = rq_of(cfs_rq);
|
|
+ int cpu = cpu_of(rq);
|
|
/*
|
|
* Ensure we either appear before our parent (if already
|
|
* enqueued) or force our parent to appear after us when it is
|
|
- * enqueued. The fact that we always enqueue bottom-up
|
|
- * reduces this to two cases.
|
|
+ * enqueued. The fact that we always enqueue bottom-up
|
|
+ * reduces this to two cases and a special case for the root
|
|
+ * cfs_rq. Furthermore, it also means that we will always reset
|
|
+ * tmp_alone_branch either when the branch is connected
|
|
+ * to a tree or when we reach the beginning of the tree
|
|
*/
|
|
if (cfs_rq->tg->parent &&
|
|
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
|
|
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
|
|
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
|
|
- } else {
|
|
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
|
|
+ /*
|
|
+ * If parent is already on the list, we add the child
|
|
+ * just before. Thanks to circular linked property of
|
|
+ * the list, this means to put the child at the tail
|
|
+ * of the list that starts by parent.
|
|
+ */
|
|
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
|
|
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
|
|
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
|
|
+ /*
|
|
+ * The branch is now connected to its tree so we can
|
|
+ * reset tmp_alone_branch to the beginning of the
|
|
+ * list.
|
|
+ */
|
|
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
|
|
+ } else if (!cfs_rq->tg->parent) {
|
|
+ /*
|
|
+ * cfs rq without parent should be put
|
|
+ * at the tail of the list.
|
|
+ */
|
|
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
|
|
+ &rq->leaf_cfs_rq_list);
|
|
+ /*
|
|
+ * We have reached the beginning of a tree so we can reset
|
|
+ * tmp_alone_branch to the beginning of the list.
|
|
+ */
|
|
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
|
|
+ } else {
|
|
+ /*
|
|
+ * The parent has not already been added so we want to
|
|
+ * make sure that it will be put after us.
|
|
+ * tmp_alone_branch points to the beginning of the branch
|
|
+ * where we will add parent.
|
|
+ */
|
|
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
|
|
+ rq->tmp_alone_branch);
|
|
+ /*
|
|
+ * update tmp_alone_branch to point to the new beginning
|
|
+ * of the branch
|
|
+ */
|
|
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
|
|
}
|
|
|
|
cfs_rq->on_list = 1;
|
|
@@ -699,6 +751,7 @@ void init_entity_runnable_average(struct
|
|
if (entity_is_task(se))
|
|
sa->load_avg = scale_load_down(se->load.weight);
|
|
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
|
|
+
|
|
/*
|
|
* At this point, util_avg won't be used in select_task_rq_fair anyway
|
|
*/
|
|
@@ -708,9 +761,7 @@ void init_entity_runnable_average(struct
|
|
}
|
|
|
|
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
|
|
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
|
|
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
|
|
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
|
|
+static void attach_entity_cfs_rq(struct sched_entity *se);
|
|
|
|
/*
|
|
* With new tasks being created, their initial util_avgs are extrapolated
|
|
@@ -742,7 +793,6 @@ void post_init_entity_util_avg(struct sc
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
struct sched_avg *sa = &se->avg;
|
|
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
|
|
- u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
|
if (cap > 0) {
|
|
if (cfs_rq->avg.util_avg != 0) {
|
|
@@ -770,14 +820,12 @@ void post_init_entity_util_avg(struct sc
|
|
* such that the next switched_to_fair() has the
|
|
* expected state.
|
|
*/
|
|
- se->avg.last_update_time = now;
|
|
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
|
|
return;
|
|
}
|
|
}
|
|
|
|
- update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
- attach_entity_load_avg(cfs_rq, se);
|
|
- update_tg_load_avg(cfs_rq, false);
|
|
+ attach_entity_cfs_rq(se);
|
|
}
|
|
|
|
#else /* !CONFIG_SMP */
|
|
@@ -937,6 +985,7 @@ update_stats_enqueue_sleeper(struct cfs_
|
|
}
|
|
|
|
trace_sched_stat_blocked(tsk, delta);
|
|
+ trace_sched_blocked_reason(tsk);
|
|
|
|
/*
|
|
* Blocking time is in units of nanosecs, so shift by
|
|
@@ -2646,16 +2695,20 @@ static void reweight_entity(struct cfs_r
|
|
|
|
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
|
|
|
|
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
|
|
+static void update_cfs_shares(struct sched_entity *se)
|
|
{
|
|
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
|
|
struct task_group *tg;
|
|
- struct sched_entity *se;
|
|
long shares;
|
|
|
|
- tg = cfs_rq->tg;
|
|
- se = tg->se[cpu_of(rq_of(cfs_rq))];
|
|
- if (!se || throttled_hierarchy(cfs_rq))
|
|
+ if (!cfs_rq)
|
|
return;
|
|
+
|
|
+ if (throttled_hierarchy(cfs_rq))
|
|
+ return;
|
|
+
|
|
+ tg = cfs_rq->tg;
|
|
+
|
|
#ifndef CONFIG_SMP
|
|
if (likely(se->load.weight == tg->shares))
|
|
return;
|
|
@@ -2664,8 +2717,9 @@ static void update_cfs_shares(struct cfs
|
|
|
|
reweight_entity(cfs_rq_of(se), se, shares);
|
|
}
|
|
+
|
|
#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
|
|
+static inline void update_cfs_shares(struct sched_entity *se)
|
|
{
|
|
}
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
@@ -2816,6 +2870,7 @@ __update_load_avg(u64 now, int cpu, stru
|
|
|
|
scale_freq = arch_scale_freq_capacity(NULL, cpu);
|
|
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
|
|
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
|
|
|
|
/* delta_w is the amount already accumulated against our next period */
|
|
delta_w = sa->period_contrib;
|
|
@@ -2891,6 +2946,26 @@ __update_load_avg(u64 now, int cpu, stru
|
|
return decayed;
|
|
}
|
|
|
|
+/*
|
|
+ * Signed add and clamp on underflow.
|
|
+ *
|
|
+ * Explicitly do a load-store to ensure the intermediate value never hits
|
|
+ * memory. This allows lockless observations without ever seeing the negative
|
|
+ * values.
|
|
+ */
|
|
+#define add_positive(_ptr, _val) do { \
|
|
+ typeof(_ptr) ptr = (_ptr); \
|
|
+ typeof(_val) val = (_val); \
|
|
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
|
|
+ \
|
|
+ res = var + val; \
|
|
+ \
|
|
+ if (val < 0 && res > var) \
|
|
+ res = 0; \
|
|
+ \
|
|
+ WRITE_ONCE(*ptr, res); \
|
|
+} while (0)
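A userspace sketch of the add_positive() semantics introduced above: a signed add that clamps to zero on underflow, so lockless readers never observe a negative intermediate value. READ_ONCE()/WRITE_ONCE() are reduced to plain accesses here purely for illustration, and typeof is the GNU C extension the kernel macro also relies on:

#include <stdio.h>

#define add_positive(_ptr, _val) do {                   \
        typeof(_ptr) ptr = (_ptr);                      \
        typeof(_val) val = (_val);                      \
        typeof(*ptr) res, var = *ptr;                   \
                                                        \
        res = var + val;                                \
                                                        \
        if (val < 0 && res > var)                       \
                res = 0;                                \
        *ptr = res;                                     \
} while (0)

int main(void)
{
        unsigned long load = 100;

        add_positive(&load, -40);       /* 60 */
        add_positive(&load, -100);      /* would wrap below 0 -> clamped to 0 */
        printf("%lu\n", load);          /* prints 0 */
        return 0;
}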
|
|
+
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
/**
|
|
* update_tg_load_avg - update the tg's load avg
|
|
@@ -2970,8 +3045,168 @@ void set_task_rq_fair(struct sched_entit
|
|
se->avg.last_update_time = n_last_update_time;
|
|
}
|
|
}
|
|
+
|
|
+/* Take into account change of utilization of a child task group */
|
|
+static inline void
|
|
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
|
|
+
|
|
+ /* Nothing to update */
|
|
+ if (!delta)
|
|
+ return;
|
|
+
|
|
+ /* Set new sched_entity's utilization */
|
|
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
|
|
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
|
|
+
|
|
+ /* Update parent cfs_rq utilization */
|
|
+ add_positive(&cfs_rq->avg.util_avg, delta);
|
|
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
|
|
+}
|
|
+
|
|
+/* Take into account change of load of a child task group */
|
|
+static inline void
|
|
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
+{
|
|
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
+ long delta, load = gcfs_rq->avg.load_avg;
|
|
+
|
|
+ /*
|
|
+ * If the load of group cfs_rq is null, the load of the
|
|
+ * sched_entity will also be null so we can skip the formula
|
|
+ */
|
|
+ if (load) {
|
|
+ long tg_load;
|
|
+
|
|
+ /* Get tg's load and ensure tg_load > 0 */
|
|
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
|
|
+
|
|
+ /* Ensure tg_load >= load and is updated with the current load */
|
|
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
|
|
+ tg_load += load;
|
|
+
|
|
+ /*
|
|
+ * We need to compute a correction term in the case that the
|
|
+ * task group is consuming more CPU than a task of equal
|
|
+ * weight. A task with a weight equal to tg->shares will have
|
|
+ * a load less or equal to scale_load_down(tg->shares).
|
|
+ * Similarly, the sched_entities that represent the task group
|
|
+ * at parent level, can't have a load higher than
|
|
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
|
|
+ * load must be <= scale_load_down(tg->shares).
|
|
+ */
|
|
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
|
|
+ /* scale gcfs_rq's load into tg's shares */
|
|
+ load *= scale_load_down(gcfs_rq->tg->shares);
|
|
+ load /= tg_load;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ delta = load - se->avg.load_avg;
|
|
+
|
|
+ /* Nothing to update */
|
|
+ if (!delta)
|
|
+ return;
|
|
+
|
|
+ /* Set new sched_entity's load */
|
|
+ se->avg.load_avg = load;
|
|
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
|
|
+
|
|
+ /* Update parent cfs_rq load */
|
|
+ add_positive(&cfs_rq->avg.load_avg, delta);
|
|
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
|
|
+
|
|
+ /*
|
|
+ * If the sched_entity is already enqueued, we also have to update the
|
|
+ * runnable load avg.
|
|
+ */
|
|
+ if (se->on_rq) {
|
|
+ /* Update parent cfs_rq runnable_load_avg */
|
|
+ add_positive(&cfs_rq->runnable_load_avg, delta);
|
|
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
|
|
+ }
|
|
+}
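A numeric sketch of the correction above: when the group-wide load exceeds scale_load_down(tg->shares), the child cfs_rq's load is scaled by shares/tg_load so the parent-level entities cannot sum past the group's shares. All values below are made up for illustration:

#include <stdio.h>

int main(void)
{
        long shares = 1024;     /* scale_load_down(tg->shares) */
        long gcfs_load = 800;   /* this CPU's share of the group load */
        long tg_load = 2048;    /* group-wide load, already >= gcfs_load */

        long load = gcfs_load;
        if (tg_load > shares) {
                load *= shares;
                load /= tg_load;
        }
        printf("%ld\n", load);  /* 400: 800 * 1024 / 2048 */
        return 0;
}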
|
|
+
|
|
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
|
|
+{
|
|
+ cfs_rq->propagate_avg = 1;
|
|
+}
|
|
+
|
|
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
|
|
+{
|
|
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
|
|
+
|
|
+ if (!cfs_rq->propagate_avg)
|
|
+ return 0;
|
|
+
|
|
+ cfs_rq->propagate_avg = 0;
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+/* Update task and its cfs_rq load average */
|
|
+static inline int propagate_entity_load_avg(struct sched_entity *se)
|
|
+{
|
|
+ struct cfs_rq *cfs_rq;
|
|
+
|
|
+ if (entity_is_task(se))
|
|
+ return 0;
|
|
+
|
|
+ if (!test_and_clear_tg_cfs_propagate(se))
|
|
+ return 0;
|
|
+
|
|
+ cfs_rq = cfs_rq_of(se);
|
|
+
|
|
+ set_tg_cfs_propagate(cfs_rq);
|
|
+
|
|
+ update_tg_cfs_util(cfs_rq, se);
|
|
+ update_tg_cfs_load(cfs_rq, se);
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check if we need to update the load and the utilization of a blocked
|
|
+ * group_entity:
|
|
+ */
|
|
+static inline bool skip_blocked_update(struct sched_entity *se)
|
|
+{
|
|
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
+
|
|
+ /*
|
|
+ * If the sched_entity still has non-zero load or utilization, we have to
|
|
+ * decay it:
|
|
+ */
|
|
+ if (se->avg.load_avg || se->avg.util_avg)
|
|
+ return false;
|
|
+
|
|
+ /*
|
|
+ * If there is a pending propagation, we have to update the load and
|
|
+ * the utilization of the sched_entity:
|
|
+ */
|
|
+ if (gcfs_rq->propagate_avg)
|
|
+ return false;
|
|
+
|
|
+ /*
|
|
+ * Otherwise, the load and the utilization of the sched_entity are
|
|
+ * already zero and there is no pending propagation, so it will be a
|
|
+ * waste of time to try to decay it:
|
|
+ */
|
|
+ return true;
|
|
+}
|
|
+
|
|
#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
+
|
|
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
|
|
+
|
|
+static inline int propagate_entity_load_avg(struct sched_entity *se)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
|
|
+
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
|
|
@@ -3042,6 +3277,7 @@ update_cfs_rq_load_avg(u64 now, struct c
|
|
sub_positive(&sa->load_avg, r);
|
|
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
|
|
removed_load = 1;
|
|
+ set_tg_cfs_propagate(cfs_rq);
|
|
}
|
|
|
|
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
|
|
@@ -3049,6 +3285,7 @@ update_cfs_rq_load_avg(u64 now, struct c
|
|
sub_positive(&sa->util_avg, r);
|
|
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
|
|
removed_util = 1;
|
|
+ set_tg_cfs_propagate(cfs_rq);
|
|
}
|
|
|
|
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
|
|
@@ -3062,27 +3299,51 @@ update_cfs_rq_load_avg(u64 now, struct c
|
|
if (update_freq && (decayed || removed_util))
|
|
cfs_rq_util_change(cfs_rq);
|
|
|
|
+ /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
|
|
+ if (cfs_rq == &rq_of(cfs_rq)->cfs)
|
|
+ trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
|
|
+
|
|
return decayed || removed_load;
|
|
}
|
|
|
|
+/*
|
|
+ * Optional action to be done while updating the load average
|
|
+ */
|
|
+#define UPDATE_TG 0x1
|
|
+#define SKIP_AGE_LOAD 0x2
|
|
+
|
|
/* Update task and its cfs_rq load average */
|
|
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
|
|
+static inline void update_load_avg(struct sched_entity *se, int flags)
|
|
{
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
u64 now = cfs_rq_clock_task(cfs_rq);
|
|
struct rq *rq = rq_of(cfs_rq);
|
|
int cpu = cpu_of(rq);
|
|
+ int decayed;
|
|
+ void *ptr = NULL;
|
|
|
|
/*
|
|
* Track task load average for carrying it to new CPU after migrated, and
|
|
* track group sched_entity load average for task_h_load calc in migration
|
|
*/
|
|
- __update_load_avg(now, cpu, &se->avg,
|
|
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
|
|
+ __update_load_avg(now, cpu, &se->avg,
|
|
se->on_rq * scale_load_down(se->load.weight),
|
|
cfs_rq->curr == se, NULL);
|
|
+ }
|
|
|
|
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
|
|
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
|
|
+ decayed |= propagate_entity_load_avg(se);
|
|
+
|
|
+ if (decayed && (flags & UPDATE_TG))
|
|
update_tg_load_avg(cfs_rq, 0);
|
|
+
|
|
+ if (entity_is_task(se)) {
|
|
+#ifdef CONFIG_SCHED_WALT
|
|
+ ptr = (void *)&(task_of(se)->ravg);
|
|
+#endif
|
|
+ trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
|
|
+ }
|
|
}
|
|
|
|
/**
|
|
@@ -3095,31 +3356,12 @@ static inline void update_load_avg(struc
|
|
*/
|
|
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- if (!sched_feat(ATTACH_AGE_LOAD))
|
|
- goto skip_aging;
|
|
-
|
|
- /*
|
|
- * If we got migrated (either between CPUs or between cgroups) we'll
|
|
- * have aged the average right before clearing @last_update_time.
|
|
- *
|
|
- * Or we're fresh through post_init_entity_util_avg().
|
|
- */
|
|
- if (se->avg.last_update_time) {
|
|
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
|
|
- &se->avg, 0, 0, NULL);
|
|
-
|
|
- /*
|
|
- * XXX: we could have just aged the entire load away if we've been
|
|
- * absent from the fair class for too long.
|
|
- */
|
|
- }
|
|
-
|
|
-skip_aging:
|
|
se->avg.last_update_time = cfs_rq->avg.last_update_time;
|
|
cfs_rq->avg.load_avg += se->avg.load_avg;
|
|
cfs_rq->avg.load_sum += se->avg.load_sum;
|
|
cfs_rq->avg.util_avg += se->avg.util_avg;
|
|
cfs_rq->avg.util_sum += se->avg.util_sum;
|
|
+ set_tg_cfs_propagate(cfs_rq);
|
|
|
|
cfs_rq_util_change(cfs_rq);
|
|
}
|
|
@@ -3134,14 +3376,12 @@ skip_aging:
|
|
*/
|
|
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
|
|
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
|
|
- cfs_rq->curr == se, NULL);
|
|
|
|
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
|
|
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
|
|
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
|
|
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
|
|
+ set_tg_cfs_propagate(cfs_rq);
|
|
|
|
cfs_rq_util_change(cfs_rq);
|
|
}
|
|
@@ -3151,34 +3391,20 @@ static inline void
|
|
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
struct sched_avg *sa = &se->avg;
|
|
- u64 now = cfs_rq_clock_task(cfs_rq);
|
|
- int migrated, decayed;
|
|
-
|
|
- migrated = !sa->last_update_time;
|
|
- if (!migrated) {
|
|
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
|
|
- se->on_rq * scale_load_down(se->load.weight),
|
|
- cfs_rq->curr == se, NULL);
|
|
- }
|
|
-
|
|
- decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
|
|
|
|
cfs_rq->runnable_load_avg += sa->load_avg;
|
|
cfs_rq->runnable_load_sum += sa->load_sum;
|
|
|
|
- if (migrated)
|
|
+ if (!sa->last_update_time) {
|
|
attach_entity_load_avg(cfs_rq, se);
|
|
-
|
|
- if (decayed || migrated)
|
|
update_tg_load_avg(cfs_rq, 0);
|
|
+ }
|
|
}
|
|
|
|
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
|
|
static inline void
|
|
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- update_load_avg(se, 1);
|
|
-
|
|
cfs_rq->runnable_load_avg =
|
|
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
|
|
cfs_rq->runnable_load_sum =
|
|
@@ -3207,13 +3433,25 @@ static inline u64 cfs_rq_last_update_tim
|
|
#endif
|
|
|
|
/*
|
|
+ * Synchronize entity load avg of dequeued entity without locking
|
|
+ * the previous rq.
|
|
+ */
|
|
+void sync_entity_load_avg(struct sched_entity *se)
|
|
+{
|
|
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
+ u64 last_update_time;
|
|
+
|
|
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
|
|
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
|
|
+}
|
|
+
|
|
+/*
|
|
* Task first catches up with cfs_rq, and then subtract
|
|
* itself from the cfs_rq (task must be off the queue now).
|
|
*/
|
|
void remove_entity_load_avg(struct sched_entity *se)
|
|
{
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- u64 last_update_time;
|
|
|
|
/*
|
|
* tasks cannot exit without having gone through wake_up_new_task() ->
|
|
@@ -3225,9 +3463,7 @@ void remove_entity_load_avg(struct sched
|
|
* calls this.
|
|
*/
|
|
|
|
- last_update_time = cfs_rq_last_update_time(cfs_rq);
|
|
-
|
|
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
|
|
+ sync_entity_load_avg(se);
|
|
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
|
|
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
|
|
}
|
|
@@ -3252,7 +3488,10 @@ update_cfs_rq_load_avg(u64 now, struct c
|
|
return 0;
|
|
}
|
|
|
|
-static inline void update_load_avg(struct sched_entity *se, int not_used)
|
|
+#define UPDATE_TG 0x0
|
|
+#define SKIP_AGE_LOAD 0x0
|
|
+
|
|
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
|
|
{
|
|
cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
|
|
}
|
|
@@ -3397,9 +3636,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
|
|
if (renorm && !curr)
|
|
se->vruntime += cfs_rq->min_vruntime;
|
|
|
|
+ /*
|
|
+ * When enqueuing a sched_entity, we must:
|
|
+ * - Update loads to have both entity and cfs_rq synced with now.
|
|
+ * - Add its load to cfs_rq->runnable_avg
|
|
+ * - For group_entity, update its weight to reflect the new share of
|
|
+ * its group cfs_rq
|
|
+ * - Add its new weight to cfs_rq->load.weight
|
|
+ */
|
|
+ update_load_avg(se, UPDATE_TG);
|
|
enqueue_entity_load_avg(cfs_rq, se);
|
|
+ update_cfs_shares(se);
|
|
account_entity_enqueue(cfs_rq, se);
|
|
- update_cfs_shares(cfs_rq);
|
|
|
|
if (flags & ENQUEUE_WAKEUP)
|
|
place_entity(cfs_rq, se, 0);
|
|
@@ -3471,6 +3719,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
|
|
* Update run-time statistics of the 'current'.
|
|
*/
|
|
update_curr(cfs_rq);
|
|
+
|
|
+ /*
|
|
+ * When dequeuing a sched_entity, we must:
|
|
+ * - Update loads to have both entity and cfs_rq synced with now.
|
|
+ * - Subtract its load from the cfs_rq->runnable_avg.
|
|
+ * - Subtract its previous weight from cfs_rq->load.weight.
|
|
+ * - For group entity, update its weight to reflect the new share
|
|
+ * of its group cfs_rq.
|
|
+ */
|
|
+ update_load_avg(se, UPDATE_TG);
|
|
dequeue_entity_load_avg(cfs_rq, se);
|
|
|
|
update_stats_dequeue(cfs_rq, se, flags);
|
|
@@ -3494,7 +3752,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
|
|
/* return excess runtime on last dequeue */
|
|
return_cfs_rq_runtime(cfs_rq);
|
|
|
|
- update_cfs_shares(cfs_rq);
|
|
+ update_cfs_shares(se);
|
|
|
|
/*
|
|
* Now advance min_vruntime if @se was the entity holding it back,
|
|
@@ -3558,7 +3816,7 @@ set_next_entity(struct cfs_rq *cfs_rq, s
|
|
*/
|
|
update_stats_wait_end(cfs_rq, se);
|
|
__dequeue_entity(cfs_rq, se);
|
|
- update_load_avg(se, 1);
|
|
+ update_load_avg(se, UPDATE_TG);
|
|
}
|
|
|
|
update_stats_curr_start(cfs_rq, se);
|
|
@@ -3676,8 +3934,8 @@ entity_tick(struct cfs_rq *cfs_rq, struc
|
|
/*
|
|
* Ensure that runnable average is periodically updated.
|
|
*/
|
|
- update_load_avg(curr, 1);
|
|
- update_cfs_shares(cfs_rq);
|
|
+ update_load_avg(curr, UPDATE_TG);
|
|
+ update_cfs_shares(curr);
|
|
|
|
#ifdef CONFIG_SCHED_HRTICK
|
|
/*
|
|
@@ -4528,6 +4786,14 @@ static inline void hrtick_update(struct
|
|
}
|
|
#endif
|
|
|
|
+#ifdef CONFIG_SMP
|
|
+static bool __cpu_overutilized(int cpu, int delta);
|
|
+static bool cpu_overutilized(int cpu);
|
|
+unsigned long boosted_cpu_util(int cpu);
|
|
+#else
|
|
+#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
|
|
+#endif
|
|
+
|
|
/*
|
|
* The enqueue_task method is called before nr_running is
|
|
* increased. Here we update the fair scheduling stats and
|
|
@@ -4538,6 +4804,9 @@ enqueue_task_fair(struct rq *rq, struct
|
|
{
|
|
struct cfs_rq *cfs_rq;
|
|
struct sched_entity *se = &p->se;
|
|
+#ifdef CONFIG_SMP
|
|
+ int task_new = flags & ENQUEUE_WAKEUP_NEW;
|
|
+#endif
|
|
|
|
/*
|
|
* If in_iowait is set, the code below may not trigger any cpufreq
|
|
@@ -4562,6 +4831,7 @@ enqueue_task_fair(struct rq *rq, struct
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
break;
|
|
cfs_rq->h_nr_running++;
|
|
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
|
|
|
|
flags = ENQUEUE_WAKEUP;
|
|
}
|
|
@@ -4569,17 +4839,49 @@ enqueue_task_fair(struct rq *rq, struct
|
|
for_each_sched_entity(se) {
|
|
cfs_rq = cfs_rq_of(se);
|
|
cfs_rq->h_nr_running++;
|
|
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
|
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
break;
|
|
|
|
- update_load_avg(se, 1);
|
|
- update_cfs_shares(cfs_rq);
|
|
+ update_load_avg(se, UPDATE_TG);
|
|
+ update_cfs_shares(se);
|
|
}
|
|
|
|
if (!se)
|
|
add_nr_running(rq, 1);
|
|
|
|
+#ifdef CONFIG_SMP
|
|
+
|
|
+ /*
|
|
+ * Update SchedTune accounting.
|
|
+ *
|
|
+ * We do it before updating the CPU capacity to ensure the
|
|
+ * boost value of the current task is accounted for in the
|
|
+ * selection of the OPP.
|
|
+ *
|
|
+ * We do it also in the case where we enqueue a throttled task;
|
|
+ * we could argue that a throttled task should not boost a CPU,
|
|
+ * however:
|
|
+ * a) properly implementing CPU boosting considering throttled
|
|
+ * tasks would significantly increase the complexity of the solution
|
|
+ * b) it's not easy to quantify the benefits introduced by
|
|
+ * such a more complex solution.
|
|
+ * Thus, for the time being we go for the simple solution and boost
|
|
+ * also for throttled RQs.
|
|
+ */
|
|
+ schedtune_enqueue_task(p, cpu_of(rq));
|
|
+
|
|
+ if (!se) {
|
|
+ walt_inc_cumulative_runnable_avg(rq, p);
|
|
+ if (!task_new && !rq->rd->overutilized &&
|
|
+ cpu_overutilized(rq->cpu)) {
|
|
+ rq->rd->overutilized = true;
|
|
+ trace_sched_overutilized(true);
|
|
+ }
|
|
+ }
|
|
+
|
|
+#endif /* CONFIG_SMP */
|
|
hrtick_update(rq);
|
|
}
|
|
|
|
@@ -4609,6 +4911,7 @@ static void dequeue_task_fair(struct rq
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
break;
|
|
cfs_rq->h_nr_running--;
|
|
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
|
|
|
|
/* Don't dequeue parent if it has other entities besides us */
|
|
if (cfs_rq->load.weight) {
|
|
@@ -4628,17 +4931,33 @@ static void dequeue_task_fair(struct rq
|
|
for_each_sched_entity(se) {
|
|
cfs_rq = cfs_rq_of(se);
|
|
cfs_rq->h_nr_running--;
|
|
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
|
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
break;
|
|
|
|
- update_load_avg(se, 1);
|
|
- update_cfs_shares(cfs_rq);
|
|
+ update_load_avg(se, UPDATE_TG);
|
|
+ update_cfs_shares(se);
|
|
}
|
|
|
|
if (!se)
|
|
sub_nr_running(rq, 1);
|
|
|
|
+#ifdef CONFIG_SMP
|
|
+
|
|
+ /*
|
|
+ * Update SchedTune accounting
|
|
+ *
|
|
+ * We do it before updating the CPU capacity to ensure the
|
|
+ * boost value of the current task is accounted for in the
|
|
+ * selection of the OPP.
|
|
+ */
|
|
+ schedtune_dequeue_task(p, cpu_of(rq));
|
|
+
|
|
+ if (!se)
|
|
+ walt_dec_cumulative_runnable_avg(rq, p);
|
|
+#endif /* CONFIG_SMP */
|
|
+
|
|
hrtick_update(rq);
|
|
}
|
|
|
|
@@ -4945,15 +5264,6 @@ static unsigned long target_load(int cpu
|
|
return max(rq->cpu_load[type-1], total);
|
|
}
|
|
|
|
-static unsigned long capacity_of(int cpu)
|
|
-{
|
|
- return cpu_rq(cpu)->cpu_capacity;
|
|
-}
|
|
-
|
|
-static unsigned long capacity_orig_of(int cpu)
|
|
-{
|
|
- return cpu_rq(cpu)->cpu_capacity_orig;
|
|
-}
|
|
|
|
static unsigned long cpu_avg_load_per_task(int cpu)
|
|
{
|
|
@@ -5105,6 +5415,532 @@ static void record_wakee(struct task_str
|
|
}
|
|
|
|
/*
|
|
+ * Returns the current capacity of cpu after applying both
|
|
+ * cpu and freq scaling.
|
|
+ */
|
|
+unsigned long capacity_curr_of(int cpu)
|
|
+{
|
|
+ return cpu_rq(cpu)->cpu_capacity_orig *
|
|
+ arch_scale_freq_capacity(NULL, cpu)
|
|
+ >> SCHED_CAPACITY_SHIFT;
|
|
+}
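A small numeric illustration of the computation above: both the original capacity and the frequency scale factor are [0..1024] fixed-point values, so their product is shifted back down by SCHED_CAPACITY_SHIFT. The example values are assumptions:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

int main(void)
{
        unsigned long cpu_capacity_orig = 1024; /* big core at max frequency */
        unsigned long freq_scale = 716;         /* CPU currently at ~70% of fmax */

        unsigned long cap_curr = cpu_capacity_orig * freq_scale >> SCHED_CAPACITY_SHIFT;
        printf("%lu\n", cap_curr);      /* 716 */
        return 0;
}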
|
|
+
|
|
+/*
|
|
+ * Returns the current capacity of cpu after applying both
|
|
+ * cpu and min freq scaling.
|
|
+ */
|
|
+unsigned long capacity_min_of(int cpu)
|
|
+{
|
|
+ if (!sched_feat(MIN_CAPACITY_CAPPING))
|
|
+ return 0;
|
|
+ return arch_scale_cpu_capacity(NULL, cpu) *
|
|
+ arch_scale_min_freq_capacity(NULL, cpu)
|
|
+ >> SCHED_CAPACITY_SHIFT;
|
|
+}
|
|
+
|
|
+
|
|
+static inline bool energy_aware(void)
|
|
+{
|
|
+ return sched_feat(ENERGY_AWARE);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * CPU candidates.
|
|
+ *
|
|
+ * These are labels to reference CPU candidates for an energy_diff.
|
|
+ * Currently we support only two possible candidates: the task's previous CPU
|
|
+ * and another candidate CPU.
|
|
+ * More advanced/aggressive EAS selection policies can consider more
|
|
+ * candidates.
|
|
+ */
|
|
+#define EAS_CPU_PRV 0
|
|
+#define EAS_CPU_NXT 1
|
|
+#define EAS_CPU_BKP 2
|
|
+#define EAS_CPU_CNT 3
|
|
+
|
|
+/*
|
|
+ * energy_diff - supports the computation of the estimated energy impact in
|
|
+ * moving a "task"'s "util_delta" between different CPU candidates.
|
|
+ */
|
|
+struct energy_env {
|
|
+ /* Utilization to move */
|
|
+ struct task_struct *p;
|
|
+ int util_delta;
|
|
+
|
|
+ /* Mask of CPUs candidates to evaluate */
|
|
+ cpumask_t cpus_mask;
|
|
+
|
|
+ /* CPU candidates to evaluate */
|
|
+ struct {
|
|
+
|
|
+ /* CPU ID, must be in cpus_mask */
|
|
+ int cpu_id;
|
|
+
|
|
+ /*
|
|
+ * Index (into sched_group_energy::cap_states) of the OPP the
|
|
+ * CPU needs to run at if the task is placed on it.
|
|
+ * This includes both the active and blocked load, due to
|
|
+ * other tasks on this CPU, as well as the task's own
|
|
+ * utilization.
|
|
+ */
|
|
+ int cap_idx;
|
|
+ int cap;
|
|
+
|
|
+ /* Estimated system energy */
|
|
+ unsigned int energy;
|
|
+
|
|
+ /* Estimated energy variation wrt EAS_CPU_PRV */
|
|
+ int nrg_delta;
|
|
+
|
|
+ } cpu[EAS_CPU_CNT];
|
|
+
|
|
+ /*
|
|
+ * Index (into energy_env::cpu) of the most energy efficient CPU for
|
|
+ * the specified energy_env::task
|
|
+ */
|
|
+ int next_idx;
|
|
+
|
|
+ /* Support data */
|
|
+ struct sched_group *sg_top;
|
|
+ struct sched_group *sg_cap;
|
|
+ struct sched_group *sg;
|
|
+};
|
|
+
|
|
+static int cpu_util_wake(int cpu, struct task_struct *p);
|
|
+
|
|
+/*
|
|
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
|
|
+ * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
|
|
+ * energy calculations.
|
|
+ *
|
|
+ * Since util is a scale-invariant utilization defined as:
|
|
+ *
|
|
+ * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
|
|
+ *
|
|
+ * the normalized util can be found using the specific capacity.
|
|
+ *
|
|
+ * capacity = capacity_orig * curr_freq/max_freq
|
|
+ *
|
|
+ * norm_util = running_time/time ~ util/capacity
|
|
+ */
|
|
+static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
|
|
+{
|
|
+ if (util >= capacity)
|
|
+ return SCHED_CAPACITY_SCALE;
|
|
+
|
|
+ return (util << SCHED_CAPACITY_SHIFT)/capacity;
|
|
+}
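A standalone recomputation of the busy-ratio formula above, saturating at SCHED_CAPACITY_SCALE; this is an illustrative sketch with assumed sample values, not the kernel code path:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

static unsigned long cpu_norm_util(unsigned long util, unsigned long capacity)
{
        if (util >= capacity)
                return SCHED_CAPACITY_SCALE;
        return (util << SCHED_CAPACITY_SHIFT) / capacity;
}

int main(void)
{
        /* 300 units of util on a 600-capacity OPP -> 50% busy -> 512 */
        printf("%lu\n", cpu_norm_util(300, 600));       /* 512 */
        /* util above capacity saturates at 1024 */
        printf("%lu\n", cpu_norm_util(700, 600));       /* 1024 */
        return 0;
}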
|
|
+
|
|
+static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
|
|
+{
|
|
+ unsigned long max_util = 0;
|
|
+ unsigned long util;
|
|
+ int cpu;
|
|
+
|
|
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
|
|
+ util = cpu_util_wake(cpu, eenv->p);
|
|
+
|
|
+ /*
|
|
+ * If we are looking at the target CPU specified by the eenv,
|
|
+ * then we should add the (estimated) utilization of the task
|
|
+ * assuming we will wake it up on that CPU.
|
|
+ */
|
|
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
|
|
+ util += eenv->util_delta;
|
|
+
|
|
+ max_util = max(max_util, util);
|
|
+
|
|
+ /*
|
|
+ * Take into account any minimum frequency imposed
|
|
+ * elsewhere which limits the energy states available.
|
|
+ * If the MIN_CAPACITY_CAPPING feature is not enabled
|
|
+ * capacity_min_of will return 0 (not capped).
|
|
+ */
|
|
+ max_util = max(max_util, capacity_min_of(cpu));
|
|
+
|
|
+ }
|
|
+
|
|
+ return max_util;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * group_norm_util() returns the approximated group util relative to its
|
|
+ * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
|
|
+ * in energy calculations.
|
|
+ *
|
|
+ * Since task executions may or may not overlap in time in the group the true
|
|
+ * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
|
|
+ * when iterating over all CPUs in the group.
|
|
+ * The latter estimate is used as it leads to a more pessimistic energy
|
|
+ * estimate (more busy).
|
|
+ */
|
|
+static unsigned
|
|
+long group_norm_util(struct energy_env *eenv, int cpu_idx)
|
|
+{
|
|
+ unsigned long capacity = eenv->cpu[cpu_idx].cap;
|
|
+ unsigned long util, util_sum = 0;
|
|
+ int cpu;
|
|
+
|
|
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
|
|
+ util = cpu_util_wake(cpu, eenv->p);
|
|
+
|
|
+ /*
|
|
+ * If we are looking at the target CPU specified by the eenv,
|
|
+ * then we should add the (estimated) utilization of the task
|
|
+ * assuming we will wake it up on that CPU.
|
|
+ */
|
|
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
|
|
+ util += eenv->util_delta;
|
|
+
|
|
+ util_sum += __cpu_norm_util(util, capacity);
|
|
+ }
|
|
+
|
|
+ return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
|
|
+}
|
|
+
|
|
+static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
|
|
+{
|
|
+ const struct sched_group_energy *sge = eenv->sg->sge;
|
|
+ int idx, max_idx = sge->nr_cap_states - 1;
|
|
+ unsigned long util = group_max_util(eenv, cpu_idx);
|
|
+
|
|
+ /* default is max_cap if we don't find a match */
|
|
+ eenv->cpu[cpu_idx].cap_idx = max_idx;
|
|
+ eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
|
|
+
|
|
+ for (idx = 0; idx < sge->nr_cap_states; idx++) {
|
|
+ if (sge->cap_states[idx].cap >= util) {
|
|
+ /* Keep track of SG's capacity */
|
|
+ eenv->cpu[cpu_idx].cap_idx = idx;
|
|
+ eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return eenv->cpu[cpu_idx].cap_idx;
|
|
+}
|
|
+
|
|
+static int group_idle_state(struct energy_env *eenv, int cpu_idx)
|
|
+{
|
|
+ struct sched_group *sg = eenv->sg;
|
|
+ int i, state = INT_MAX;
|
|
+ int src_in_grp, dst_in_grp;
|
|
+ long grp_util = 0;
|
|
+
|
|
+ /* Find the shallowest idle state in the sched group. */
|
|
+ for_each_cpu(i, sched_group_cpus(sg))
|
|
+ state = min(state, idle_get_state_idx(cpu_rq(i)));
|
|
+
|
|
+ /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
|
|
+ state++;
|
|
+
|
|
+ src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
|
|
+ sched_group_cpus(sg));
|
|
+ dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
|
|
+ sched_group_cpus(sg));
|
|
+ if (src_in_grp == dst_in_grp) {
|
|
+ /* both CPUs under consideration are in the same group or not in
|
|
+ * either group, migration should leave idle state the same.
|
|
+ */
|
|
+ goto end;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Try to estimate if a deeper idle state is
|
|
+ * achievable when we move the task.
|
|
+ */
|
|
+ for_each_cpu(i, sched_group_cpus(sg)) {
|
|
+ grp_util += cpu_util_wake(i, eenv->p);
|
|
+ if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
|
|
+ grp_util += eenv->util_delta;
|
|
+ }
|
|
+
|
|
+ if (grp_util <=
|
|
+ ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
|
|
+ /* after moving, this group is at most partly
|
|
+ * occupied, so it should have some idle time.
|
|
+ */
|
|
+ int max_idle_state_idx = sg->sge->nr_idle_states - 2;
|
|
+ int new_state = grp_util * max_idle_state_idx;
|
|
+ if (grp_util <= 0)
|
|
+ /* group will have no util, use lowest state */
|
|
+ new_state = max_idle_state_idx + 1;
|
|
+ else {
|
|
+ /* for partially idle, linearly map util to idle
|
|
+ * states, excluding the lowest one. This does not
|
|
+ * correspond to the state we expect to enter in
|
|
+ * reality, but an indication of what might happen.
|
|
+ */
|
|
+ new_state = min(max_idle_state_idx, (int)
|
|
+ (new_state / sg->sgc->max_capacity));
|
|
+ new_state = max_idle_state_idx - new_state;
|
|
+ }
|
|
+ state = new_state;
|
|
+ } else {
|
|
+ /* After moving, the group will be fully occupied
|
|
+ * so assume it will not be idle at all.
|
|
+ */
|
|
+ state = 0;
|
|
+ }
|
|
+end:
|
|
+ return state;
|
|
+}
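The partially-idle branch above maps group utilization linearly onto the idle-state indices, with deeper states for lower utilization. A simplified standalone sketch of that mapping; the function name and the sample capacity/state values are assumptions, and the group-weight factor used in the kernel's threshold check is left out:

#include <stdio.h>

static int estimate_idle_state(long grp_util, long max_capacity,
                               int nr_idle_states)
{
        int max_idle_state_idx = nr_idle_states - 2;
        int new_state;

        if (grp_util <= 0)
                return max_idle_state_idx + 1;  /* no util: deepest state */

        new_state = grp_util * max_idle_state_idx / max_capacity;
        if (new_state > max_idle_state_idx)
                new_state = max_idle_state_idx;
        return max_idle_state_idx - new_state;  /* more util -> shallower state */
}

int main(void)
{
        /* 4 idle states, group capacity 1024 per the cap_states table */
        printf("%d\n", estimate_idle_state(0, 1024, 4));        /* 3: deepest */
        printf("%d\n", estimate_idle_state(256, 1024, 4));      /* 2 */
        printf("%d\n", estimate_idle_state(900, 1024, 4));      /* 1: mostly busy */
        return 0;
}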
|
|
+
|
|
+/*
|
|
+ * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
|
|
+ *
|
|
+ * This works in iterations to compute the SG's energy for each CPU
|
|
+ * candidate defined by the energy_env's cpu array.
|
|
+ *
|
|
+ * NOTE: in the following computations for busy_energy and idle_energy we do
|
|
+ * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
|
|
+ * The required scaling will be performed just one time, by the calling
|
|
+ * functions, once we have accumulated the contributions for all the SGs.
|
|
+ */
|
|
+static void calc_sg_energy(struct energy_env *eenv)
|
|
+{
|
|
+ struct sched_group *sg = eenv->sg;
|
|
+ int busy_energy, idle_energy;
|
|
+ unsigned int busy_power;
|
|
+ unsigned int idle_power;
|
|
+ unsigned long sg_util;
|
|
+ int cap_idx, idle_idx;
|
|
+ int total_energy = 0;
|
|
+ int cpu_idx;
|
|
+
|
|
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
|
|
+
|
|
+
|
|
+ if (eenv->cpu[cpu_idx].cpu_id == -1)
|
|
+ continue;
|
|
+ /* Compute ACTIVE energy */
|
|
+ cap_idx = find_new_capacity(eenv, cpu_idx);
|
|
+ busy_power = sg->sge->cap_states[cap_idx].power;
|
|
+ /*
|
|
+ * in order to calculate cpu_norm_util, we need to know which
|
|
+ * capacity level the group will be at, so calculate that first
|
|
+ */
|
|
+ sg_util = group_norm_util(eenv, cpu_idx);
|
|
+
|
|
+ busy_energy = sg_util * busy_power;
|
|
+
|
|
+ /* Compute IDLE energy */
|
|
+ idle_idx = group_idle_state(eenv, cpu_idx);
|
|
+ idle_power = sg->sge->idle_states[idle_idx].power;
|
|
+
|
|
+ idle_energy = SCHED_CAPACITY_SCALE - sg_util;
|
|
+ idle_energy *= idle_power;
|
|
+
|
|
+ total_energy = busy_energy + idle_energy;
|
|
+ eenv->cpu[cpu_idx].energy += total_energy;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * compute_energy() computes the absolute variation in energy consumption by
|
|
+ * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
|
|
+ *
|
|
+ * NOTE: compute_energy() may fail when racing with sched_domain updates, in
|
|
+ * which case we abort by returning -EINVAL.
|
|
+ */
|
|
+static int compute_energy(struct energy_env *eenv)
|
|
+{
|
|
+ struct cpumask visit_cpus;
|
|
+ int cpu_count;
|
|
+
|
|
+ WARN_ON(!eenv->sg_top->sge);
|
|
+
|
|
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
|
|
+ /* If a cpu is hotplugged in while we are in this function,
|
|
+ * it does not appear in the existing visit_cpus mask
|
|
+ * which came from the sched_group pointer of the
|
|
+ * sched_domain pointed at by sd_ea for either the prev
|
|
+ * or next cpu and was dereferenced in __energy_diff.
|
|
+ * Since we will dereference sd_scs later as we iterate
|
|
+ * through the CPUs we expect to visit, new CPUs can
|
|
+ * be present which are not in the visit_cpus mask.
|
|
+ * Guard this with cpu_count.
|
|
+ */
|
|
+ cpu_count = cpumask_weight(&visit_cpus);
|
|
+
|
|
+ while (!cpumask_empty(&visit_cpus)) {
|
|
+ struct sched_group *sg_shared_cap = NULL;
|
|
+ int cpu = cpumask_first(&visit_cpus);
|
|
+ struct sched_domain *sd;
|
|
+
|
|
+ /*
|
|
+ * Is the group utilization affected by cpus outside this
|
|
+ * sched_group?
|
|
+ * This sd may have groups with cpus which were not present
|
|
+ * when we took visit_cpus.
|
|
+ */
|
|
+ sd = rcu_dereference(per_cpu(sd_scs, cpu));
|
|
+ if (sd && sd->parent)
|
|
+ sg_shared_cap = sd->parent->groups;
|
|
+
|
|
+ for_each_domain(cpu, sd) {
|
|
+ struct sched_group *sg = sd->groups;
|
|
+
|
|
+ /* Has this sched_domain already been visited? */
|
|
+ if (sd->child && group_first_cpu(sg) != cpu)
|
|
+ break;
|
|
+
|
|
+ do {
|
|
+ eenv->sg_cap = sg;
|
|
+ if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
|
|
+ eenv->sg_cap = sg_shared_cap;
|
|
+
|
|
+ /*
|
|
+ * Compute the energy for all the candidate
|
|
+ * CPUs in the current visited SG.
|
|
+ */
|
|
+ eenv->sg = sg;
|
|
+ calc_sg_energy(eenv);
|
|
+
|
|
+ if (!sd->child) {
|
|
+ /*
|
|
+ * cpu_count here is the number of
|
|
+ * cpus we expect to visit in this
|
|
+ * calculation. If we race against
|
|
+ * hotplug, we can have extra cpus
|
|
+ * added to the groups we are
|
|
+ * iterating which do not appear in
|
|
+ * the visit_cpus mask. In that case
|
|
+ * we are not able to calculate energy
|
|
+ * without restarting so we will bail
|
|
+ * out and use prev_cpu this time.
|
|
+ */
|
|
+ if (!cpu_count)
|
|
+ return -EINVAL;
|
|
+ cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
|
|
+ cpu_count--;
|
|
+ }
|
|
+
|
|
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
|
|
+ goto next_cpu;
|
|
+
|
|
+ } while (sg = sg->next, sg != sd->groups);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If we raced with hotplug and got an sd NULL-pointer;
|
|
+ * returning a wrong energy estimation is better than
|
|
+ * entering an infinite loop.
|
|
+ * Specifically: If a cpu is unplugged after we took
|
|
+ * the visit_cpus mask, it no longer has an sd_scs
|
|
+ * pointer, so when we dereference it, we get NULL.
|
|
+ */
|
|
+ if (cpumask_test_cpu(cpu, &visit_cpus))
|
|
+ return -EINVAL;
|
|
+next_cpu:
|
|
+ cpumask_clear_cpu(cpu, &visit_cpus);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
|
|
+{
|
|
+ return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * select_energy_cpu_idx(): estimate the energy impact of changing the
|
|
+ * utilization distribution.
|
|
+ *
|
|
+ * The eenv parameter specifies the changes: utilisation amount and a pair of
|
|
+ * possible CPU candidates (the previous CPU and a different target CPU).
|
|
+ *
|
|
+ * This function returns the index of a CPU candidate specified by the
|
|
+ * energy_env which corresponds to the first CPU saving energy.
|
|
+ * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
|
|
+ * efficient than running on prev_cpu. This is also the value returned in case
|
|
+ * of abort due to error conditions during the computations.
|
|
+ * A value greater than zero means that the first energy-efficient CPU is the
|
|
+ * one represented by eenv->cpu[eenv->next_idx].cpu_id.
|
|
+ */
|
|
+static inline int select_energy_cpu_idx(struct energy_env *eenv)
|
|
+{
|
|
+ struct sched_domain *sd;
|
|
+ struct sched_group *sg;
|
|
+ int sd_cpu = -1;
|
|
+ int cpu_idx;
|
|
+ int margin;
|
|
+
|
|
+ sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
|
|
+ sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
|
|
+ if (!sd)
|
|
+ return EAS_CPU_PRV;
|
|
+
|
|
+ cpumask_clear(&eenv->cpus_mask);
|
|
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
|
|
+ int cpu = eenv->cpu[cpu_idx].cpu_id;
|
|
+
|
|
+ if (cpu < 0)
|
|
+ continue;
|
|
+ cpumask_set_cpu(cpu, &eenv->cpus_mask);
|
|
+ }
|
|
+
|
|
+ sg = sd->groups;
|
|
+ do {
|
|
+ /* Skip SGs which do not contain a candidate CPU */
|
|
+ if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
|
|
+ continue;
|
|
+
|
|
+ eenv->sg_top = sg;
|
|
+ /* energy is unscaled to reduce rounding errors */
|
|
+ if (compute_energy(eenv) == -EINVAL)
|
|
+ return EAS_CPU_PRV;
|
|
+
|
|
+ } while (sg = sg->next, sg != sd->groups);
|
|
+
|
|
+ /* Scale energy before comparisons */
|
|
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
|
|
+ eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
|
|
+
|
|
+ /*
|
|
+ * Compute the dead-zone margin used to prevent too many task
|
|
+ * migrations with negligible energy savings.
|
|
+ * An energy saving is considered meaningful if it reduces the energy
|
|
+ * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56%
|
|
+ */
|
|
+ margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
|
|
+
|
|
+ /*
|
|
+ * By default the EAS_CPU_PRV CPU is considered the most energy
|
|
+ * efficient, with a 0 energy variation.
|
|
+ */
|
|
+ eenv->next_idx = EAS_CPU_PRV;
|
|
+
|
|
+ /*
|
|
+ * Compare the other CPU candidates to find a CPU which can be
|
|
+ * more energy efficient than EAS_CPU_PRV
|
|
+ */
|
|
+ for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
|
|
+ /* Skip scheduling candidates which are not valid */
|
|
+ if (eenv->cpu[cpu_idx].cpu_id < 0)
|
|
+ continue;
|
|
+ /* Compute energy delta wrt EAS_CPU_PRV */
|
|
+ eenv->cpu[cpu_idx].nrg_delta =
|
|
+ eenv->cpu[cpu_idx].energy -
|
|
+ eenv->cpu[EAS_CPU_PRV].energy;
|
|
+ /* filter energy variations within the dead-zone margin */
|
|
+ if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
|
|
+ eenv->cpu[cpu_idx].nrg_delta = 0;
|
|
+ /* update the schedule candidate with min(nrg_delta) */
|
|
+ if (eenv->cpu[cpu_idx].nrg_delta <
|
|
+ eenv->cpu[eenv->next_idx].nrg_delta) {
|
|
+ eenv->next_idx = cpu_idx;
|
|
+ if (sched_feat(FBT_STRICT_ORDER))
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return eenv->next_idx;
|
|
+}
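The dead-zone filtering above can be restated in a few lines: an alternative CPU is only picked when it saves more than prev_energy/64 (~1.56%); smaller deltas are treated as noise and the previous CPU is kept. A minimal sketch with assumed numbers:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        int prev_energy = 2000, next_energy = 1980;
        int margin = prev_energy >> 6;                  /* 31, ~1.56% of prev */
        int nrg_delta = next_energy - prev_energy;      /* -20 */

        if (abs(nrg_delta) < margin)
                nrg_delta = 0;          /* within the dead zone: ignore */

        printf("%s\n", nrg_delta < 0 ? "migrate" : "keep prev CPU");
        return 0;
}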
|
|
+
|
|
+/*
|
|
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
|
|
*
|
|
* A waker of many should wake a different task than the one last awakened
|
|
@@ -5200,24 +6036,180 @@ static int wake_affine(struct sched_doma
|
|
return 1;
|
|
}
|
|
|
|
+static inline unsigned long task_util(struct task_struct *p)
|
|
+{
|
|
+#ifdef CONFIG_SCHED_WALT
|
|
+ if (!walt_disabled && sysctl_sched_use_walt_task_util) {
|
|
+ unsigned long demand = p->ravg.demand;
|
|
+ return (demand << SCHED_CAPACITY_SHIFT) / walt_ravg_window;
|
|
+ }
|
|
+#endif
|
|
+ return p->se.avg.util_avg;
|
|
+}
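The WALT branch above rescales a task's windowed demand (nanoseconds of work per window) into the PELT-style [0..1024] range. A small arithmetic sketch; the 20 ms window length is only an assumed example value:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

int main(void)
{
        unsigned long long walt_ravg_window = 20000000ULL;      /* 20 ms window, in ns (assumed) */
        unsigned long long demand = 5000000ULL;                 /* task ran ~5 ms per window */

        unsigned long long util = (demand << SCHED_CAPACITY_SHIFT) / walt_ravg_window;
        printf("%llu\n", util); /* 256, i.e. ~25% of SCHED_CAPACITY_SCALE */
        return 0;
}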
|
|
+
|
|
+static inline unsigned long boosted_task_util(struct task_struct *p);
|
|
+
|
|
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
|
|
+{
|
|
+ unsigned long capacity = capacity_of(cpu);
|
|
+
|
|
+ util += boosted_task_util(p);
|
|
+
|
|
+ return (capacity * 1024) > (util * capacity_margin);
|
|
+}
|
|
+
|
|
+static inline bool task_fits_max(struct task_struct *p, int cpu)
|
|
+{
|
|
+ unsigned long capacity = capacity_of(cpu);
|
|
+ unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
|
|
+
|
|
+ if (capacity == max_capacity)
|
|
+ return true;
|
|
+
|
|
+ if (capacity * capacity_margin > max_capacity * 1024)
|
|
+ return true;
|
|
+
|
|
+ return __task_fits(p, cpu, 0);
|
|
+}
|
|
+
|
|
+static bool __cpu_overutilized(int cpu, int delta)
|
|
+{
|
|
+ return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
|
|
+}
|
|
+
|
|
+static bool cpu_overutilized(int cpu)
|
|
+{
|
|
+ return __cpu_overutilized(cpu, 0);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SCHED_TUNE
|
|
+
|
|
+struct reciprocal_value schedtune_spc_rdiv;
|
|
+
|
|
+static long
|
|
+schedtune_margin(unsigned long signal, long boost)
|
|
+{
|
|
+ long long margin = 0;
|
|
+
|
|
+ /*
|
|
+ * Signal proportional compensation (SPC)
|
|
+ *
|
|
+ * The Boost (B) value is used to compute a Margin (M) which is
|
|
+ * proportional to the complement of the original Signal (S):
|
|
+ * M = B * (SCHED_CAPACITY_SCALE - S)
|
|
+ * The obtained M could be used by the caller to "boost" S.
|
|
+ */
|
|
+ if (boost >= 0) {
|
|
+ margin = SCHED_CAPACITY_SCALE - signal;
|
|
+ margin *= boost;
|
|
+ } else {
|
|
+ margin = -signal * boost;
|
|
+ }
|
|
+
|
|
+ margin = reciprocal_divide(margin, schedtune_spc_rdiv);
|
|
+ if (boost < 0)
|
|
+ margin *= -1;
|
|
+
|
|
+ return margin;
|
|
+}
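A sketch of the Signal Proportional Compensation margin described above, M = boost% * (SCHED_CAPACITY_SCALE - S) for positive boost. The kernel divides by 100 via reciprocal_divide() and the precomputed schedtune_spc_rdiv; a plain division is substituted here for clarity, and the sample values are assumptions:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024L

static long spc_margin(unsigned long signal, long boost)
{
        long long margin;

        if (boost >= 0)
                margin = (SCHED_CAPACITY_SCALE - (long long)signal) * boost;
        else
                margin = -(long long)signal * boost;

        margin /= 100;                  /* boost is a percentage */
        return boost < 0 ? -margin : margin;
}

int main(void)
{
        /* util 256 with a 50% boost: 50 * (1024 - 256) / 100 = 384 */
        printf("%ld\n", spc_margin(256, 50));   /* 384 */
        /* a negative boost removes part of the signal instead */
        printf("%ld\n", spc_margin(256, -50));  /* -128 */
        return 0;
}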
|
|
+
|
|
+static inline int
|
|
+schedtune_cpu_margin(unsigned long util, int cpu)
|
|
+{
|
|
+ int boost = schedtune_cpu_boost(cpu);
|
|
+
|
|
+ if (boost == 0)
|
|
+ return 0;
|
|
+
|
|
+ return schedtune_margin(util, boost);
|
|
+}
|
|
+
|
|
+static inline long
|
|
+schedtune_task_margin(struct task_struct *p)
|
|
+{
|
|
+ int boost = schedtune_task_boost(p);
|
|
+ unsigned long util;
|
|
+ long margin;
|
|
+
|
|
+ if (boost == 0)
|
|
+ return 0;
|
|
+
|
|
+ util = task_util(p);
|
|
+ margin = schedtune_margin(util, boost);
|
|
+
|
|
+ return margin;
|
|
+}
|
|
+
|
|
+#else /* CONFIG_SCHED_TUNE */
|
|
+
|
|
+static inline int
|
|
+schedtune_cpu_margin(unsigned long util, int cpu)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline int
|
|
+schedtune_task_margin(struct task_struct *p)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_SCHED_TUNE */
|
|
+
|
|
+unsigned long
|
|
+boosted_cpu_util(int cpu)
|
|
+{
|
|
+ unsigned long util = cpu_util_freq(cpu);
|
|
+ long margin = schedtune_cpu_margin(util, cpu);
|
|
+
|
|
+ trace_sched_boost_cpu(cpu, util, margin);
|
|
+
|
|
+ return util + margin;
|
|
+}
|
|
+
|
|
+static inline unsigned long
|
|
+boosted_task_util(struct task_struct *p)
|
|
+{
|
|
+ unsigned long util = task_util(p);
|
|
+ long margin = schedtune_task_margin(p);
|
|
+
|
|
+ trace_sched_boost_task(p, util, margin);
|
|
+
|
|
+ return util + margin;
|
|
+}
|
|
+
|
|
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
|
|
+{
|
|
+ return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
|
|
+}
|
|
+
|
|
/*
|
|
* find_idlest_group finds and returns the least busy CPU group within the
|
|
* domain.
|
|
+ *
|
|
+ * Assumes p is allowed on at least one CPU in sd.
|
|
*/
|
|
static struct sched_group *
|
|
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
|
int this_cpu, int sd_flag)
|
|
{
|
|
struct sched_group *idlest = NULL, *group = sd->groups;
|
|
- unsigned long min_load = ULONG_MAX, this_load = 0;
|
|
+ struct sched_group *most_spare_sg = NULL;
|
|
+ unsigned long min_runnable_load = ULONG_MAX;
|
|
+ unsigned long this_runnable_load = ULONG_MAX;
|
|
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
|
|
+ unsigned long most_spare = 0, this_spare = 0;
|
|
int load_idx = sd->forkexec_idx;
|
|
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
|
|
+ int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
|
|
+ unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
|
|
+ (sd->imbalance_pct-100) / 100;
|
|
|
|
if (sd_flag & SD_BALANCE_WAKE)
|
|
load_idx = sd->wake_idx;
|
|
|
|
do {
|
|
- unsigned long load, avg_load;
|
|
+ unsigned long load, avg_load, runnable_load;
|
|
+ unsigned long spare_cap, max_spare_cap;
|
|
int local_group;
|
|
int i;
|
|
|
|
@@ -5229,8 +6221,13 @@ find_idlest_group(struct sched_domain *s
|
|
local_group = cpumask_test_cpu(this_cpu,
|
|
sched_group_cpus(group));
|
|
|
|
- /* Tally up the load of all CPUs in the group */
|
|
+ /*
|
|
+ * Tally up the load of all CPUs in the group and find
|
|
+ * the group containing the CPU with most spare capacity.
|
|
+ */
|
|
avg_load = 0;
|
|
+ runnable_load = 0;
|
|
+ max_spare_cap = 0;
|
|
|
|
for_each_cpu(i, sched_group_cpus(group)) {
|
|
/* Bias balancing toward cpus of our domain */
|
|
@@ -5239,30 +6236,85 @@ find_idlest_group(struct sched_domain *s
|
|
else
|
|
load = target_load(i, load_idx);
|
|
|
|
- avg_load += load;
|
|
+ runnable_load += load;
|
|
+
|
|
+ avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
|
|
+
|
|
+ spare_cap = capacity_spare_wake(i, p);
|
|
+
|
|
+ if (spare_cap > max_spare_cap)
|
|
+ max_spare_cap = spare_cap;
|
|
}
|
|
|
|
/* Adjust by relative CPU capacity of the group */
|
|
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
|
|
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
|
|
+ group->sgc->capacity;
|
|
+ runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
|
|
+ group->sgc->capacity;
|
|
|
|
if (local_group) {
|
|
- this_load = avg_load;
|
|
- } else if (avg_load < min_load) {
|
|
- min_load = avg_load;
|
|
- idlest = group;
|
|
+ this_runnable_load = runnable_load;
|
|
+ this_avg_load = avg_load;
|
|
+ this_spare = max_spare_cap;
|
|
+ } else {
|
|
+ if (min_runnable_load > (runnable_load + imbalance)) {
|
|
+ /*
|
|
+ * The runnable load is significantly smaller
|
|
+ * so we can pick this new cpu
|
|
+ */
|
|
+ min_runnable_load = runnable_load;
|
|
+ min_avg_load = avg_load;
|
|
+ idlest = group;
|
|
+ } else if ((runnable_load < (min_runnable_load + imbalance)) &&
|
|
+ (100*min_avg_load > imbalance_scale*avg_load)) {
|
|
+ /*
|
|
+ * The runnable loads are close so we take
|
|
+ * into account blocked load through avg_load
|
|
+ * which is blocked + runnable load
|
|
+ */
|
|
+ min_avg_load = avg_load;
|
|
+ idlest = group;
|
|
+ }
|
|
+
|
|
+ if (most_spare < max_spare_cap) {
|
|
+ most_spare = max_spare_cap;
|
|
+ most_spare_sg = group;
|
|
+ }
|
|
}
|
|
} while (group = group->next, group != sd->groups);
|
|
|
|
- if (!idlest || 100*this_load < imbalance*min_load)
|
|
+ /*
|
|
+ * The cross-over point between using spare capacity or least load
|
|
+ * is too conservative for high utilization tasks on partially
|
|
+ * utilized systems if we require spare_capacity > task_util(p),
|
|
+ * so we allow for some task stuffing by using
|
|
+ * spare_capacity > task_util(p)/2.
|
|
+ * spare capacity can't be used for fork because the utilization has
|
|
+ * not been set yet as it needs to get a rq to init the utilization
|
|
+ */
|
|
+ if (sd_flag & SD_BALANCE_FORK)
|
|
+ goto skip_spare;
|
|
+
|
|
+ if (this_spare > task_util(p) / 2 &&
|
|
+ imbalance_scale*this_spare > 100*most_spare)
|
|
+ return NULL;
|
|
+ else if (most_spare > task_util(p) / 2)
|
|
+ return most_spare_sg;
|
|
+
|
|
+skip_spare:
|
|
+ if (!idlest ||
|
|
+ (min_runnable_load > (this_runnable_load + imbalance)) ||
|
|
+ ((this_runnable_load < (min_runnable_load + imbalance)) &&
|
|
+ (100*this_avg_load < imbalance_scale*min_avg_load)))
|
|
return NULL;
|
|
return idlest;
|
|
}
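A tiny illustration of the crossover rule explained in the comment inside find_idlest_group() above: spare capacity wins over least load only when the best group's spare capacity exceeds half of the task's utilization. The helper name and values are assumptions:

#include <stdbool.h>
#include <stdio.h>

static bool prefer_spare_capacity(unsigned long task_util,
                                  unsigned long most_spare)
{
        return most_spare > task_util / 2;
}

int main(void)
{
        /* a 400-util task: a group with 150 spare is not worth stuffing... */
        printf("%d\n", prefer_spare_capacity(400, 150));        /* 0 */
        /* ...but 250 spare (> 200) is accepted even though it's < task_util */
        printf("%d\n", prefer_spare_capacity(400, 250));        /* 1 */
        return 0;
}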
|
|
|
|
/*
|
|
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
|
|
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
|
|
*/
|
|
static int
|
|
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
{
|
|
unsigned long load, min_load = ULONG_MAX;
|
|
unsigned int min_exit_latency = UINT_MAX;
|
|
@@ -5311,6 +6363,68 @@ find_idlest_cpu(struct sched_group *grou
|
|
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
|
|
}
|
|
|
|
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
|
|
+ int cpu, int prev_cpu, int sd_flag)
|
|
+{
|
|
+ int wu = sd_flag & SD_BALANCE_WAKE;
|
|
+ int cas_cpu = -1;
|
|
+ int new_cpu = cpu;
|
|
+
|
|
+ if (wu) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts);
|
|
+ schedstat_inc(this_rq()->eas_stats.cas_attempts);
|
|
+ }
|
|
+
|
|
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
|
|
+ return prev_cpu;
|
|
+
|
|
+ while (sd) {
|
|
+ struct sched_group *group;
|
|
+ struct sched_domain *tmp;
|
|
+ int weight;
|
|
+
|
|
+ if (wu)
|
|
+ schedstat_inc(sd->eas_stats.cas_attempts);
|
|
+
|
|
+ if (!(sd->flags & sd_flag)) {
|
|
+ sd = sd->child;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ group = find_idlest_group(sd, p, cpu, sd_flag);
|
|
+ if (!group) {
|
|
+ sd = sd->child;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
|
|
+ if (new_cpu == cpu) {
|
|
+ /* Now try balancing at a lower domain level of cpu */
|
|
+ sd = sd->child;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* Now try balancing at a lower domain level of new_cpu */
|
|
+ cpu = cas_cpu = new_cpu;
|
|
+ weight = sd->span_weight;
|
|
+ sd = NULL;
|
|
+ for_each_domain(cpu, tmp) {
|
|
+ if (weight <= tmp->span_weight)
|
|
+ break;
|
|
+ if (tmp->flags & sd_flag)
|
|
+ sd = tmp;
|
|
+ }
|
|
+ /* while loop will break here if sd == NULL */
|
|
+ }
|
|
+
|
|
+ if (wu && (cas_cpu >= 0)) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_cas_count);
|
|
+ schedstat_inc(this_rq()->eas_stats.cas_count);
|
|
+ }
|
|
+
|
|
+ return new_cpu;
|
|
+}
|
|
+
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
static inline void set_idle_cores(int cpu, int val)
|
|
@@ -5478,96 +6592,583 @@ static int select_idle_cpu(struct task_s
|
|
static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
|
{
|
|
struct sched_domain *sd;
|
|
- int i;
|
|
+ struct sched_group *sg;
|
|
+ int i = task_cpu(p);
|
|
+ int best_idle_cpu = -1;
|
|
+ int best_idle_cstate = INT_MAX;
|
|
+ unsigned long best_idle_capacity = ULONG_MAX;
|
|
|
|
- if (idle_cpu(target))
|
|
- return target;
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_attempts);
|
|
+ schedstat_inc(this_rq()->eas_stats.sis_attempts);
|
|
+
|
|
+ if (!sysctl_sched_cstate_aware) {
|
|
+ if (idle_cpu(target)) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_idle);
|
|
+ schedstat_inc(this_rq()->eas_stats.sis_idle);
|
|
+ return target;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If the previous cpu is cache affine and idle, don't be stupid.
|
|
+ */
|
|
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine);
|
|
+ schedstat_inc(this_rq()->eas_stats.sis_cache_affine);
|
|
+ return i;
|
|
+ }
|
|
+
|
|
+ sd = rcu_dereference(per_cpu(sd_llc, target));
|
|
+ if (!sd)
|
|
+ return target;
|
|
+
|
|
+ i = select_idle_core(p, sd, target);
|
|
+ if ((unsigned)i < nr_cpumask_bits)
|
|
+ return i;
|
|
+
|
|
+ i = select_idle_cpu(p, sd, target);
|
|
+ if ((unsigned)i < nr_cpumask_bits)
|
|
+ return i;
|
|
+
|
|
+ i = select_idle_smt(p, sd, target);
|
|
+ if ((unsigned)i < nr_cpumask_bits)
|
|
+ return i;
|
|
+ }
|
|
|
|
/*
|
|
- * If the previous cpu is cache affine and idle, don't be stupid.
|
|
+ * Otherwise, iterate the domains and find an eligible idle cpu.
|
|
*/
|
|
- if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
|
|
- return prev;
|
|
-
|
|
sd = rcu_dereference(per_cpu(sd_llc, target));
|
|
- if (!sd)
|
|
- return target;
|
|
+ for_each_lower_domain(sd) {
|
|
+ sg = sd->groups;
|
|
+ do {
|
|
+ if (!cpumask_intersects(sched_group_cpus(sg),
|
|
+ tsk_cpus_allowed(p)))
|
|
+ goto next;
|
|
|
|
- i = select_idle_core(p, sd, target);
|
|
- if ((unsigned)i < nr_cpumask_bits)
|
|
- return i;
|
|
|
|
- i = select_idle_cpu(p, sd, target);
|
|
- if ((unsigned)i < nr_cpumask_bits)
|
|
- return i;
|
|
+ if (sysctl_sched_cstate_aware) {
|
|
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
|
|
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
|
|
+ unsigned long new_usage = boosted_task_util(p);
|
|
+ unsigned long capacity_orig = capacity_orig_of(i);
|
|
|
|
- i = select_idle_smt(p, sd, target);
|
|
- if ((unsigned)i < nr_cpumask_bits)
|
|
- return i;
|
|
+ if (new_usage > capacity_orig || !idle_cpu(i))
|
|
+ goto next;
|
|
+
|
|
+ if (i == target && new_usage <= capacity_curr_of(target)) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_suff_cap);
|
|
+ schedstat_inc(this_rq()->eas_stats.sis_suff_cap);
|
|
+ schedstat_inc(sd->eas_stats.sis_suff_cap);
|
|
+ return target;
|
|
+ }
|
|
+
|
|
+ if (idle_idx < best_idle_cstate &&
|
|
+ capacity_orig <= best_idle_capacity) {
|
|
+ best_idle_cpu = i;
|
|
+ best_idle_cstate = idle_idx;
|
|
+ best_idle_capacity = capacity_orig;
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ for_each_cpu(i, sched_group_cpus(sg)) {
|
|
+ if (i == target || !idle_cpu(i))
|
|
+ goto next;
|
|
+ }
|
|
+
|
|
+ target = cpumask_first_and(sched_group_cpus(sg),
|
|
+ tsk_cpus_allowed(p));
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_idle_cpu);
|
|
+ schedstat_inc(this_rq()->eas_stats.sis_idle_cpu);
|
|
+ schedstat_inc(sd->eas_stats.sis_idle_cpu);
|
|
+ goto done;
|
|
+ }
|
|
+next:
|
|
+ sg = sg->next;
|
|
+ } while (sg != sd->groups);
|
|
+ }
|
|
+
|
|
+ if (best_idle_cpu >= 0)
|
|
+ target = best_idle_cpu;
|
|
+
|
|
+done:
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_count);
|
|
+ schedstat_inc(this_rq()->eas_stats.sis_count);
|
|
|
|
return target;
|
|
}
|
|
-
|
|
+
|
|
/*
|
|
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
|
|
- * tasks. The unit of the return value must be the one of capacity so we can
|
|
- * compare the utilization with the capacity of the CPU that is available for
|
|
- * CFS task (ie cpu_capacity).
|
|
- *
|
|
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
|
|
- * recent utilization of currently non-runnable tasks on a CPU. It represents
|
|
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
|
|
- * capacity_orig is the cpu_capacity available at the highest frequency
|
|
- * (arch_scale_freq_capacity()).
|
|
- * The utilization of a CPU converges towards a sum equal to or less than the
|
|
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
|
|
- * the running time on this CPU scaled by capacity_curr.
|
|
- *
|
|
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
|
|
- * higher than capacity_orig because of unfortunate rounding in
|
|
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
|
|
- * the average stabilizes with the new running time. We need to check that the
|
|
- * utilization stays within the range of [0..capacity_orig] and cap it if
|
|
- * necessary. Without utilization capping, a group could be seen as overloaded
|
|
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
|
|
- * available capacity. We allow utilization to overshoot capacity_curr (but not
|
|
- * capacity_orig) as it useful for predicting the capacity required after task
|
|
- * migrations (scheduler-driven DVFS).
|
|
+ * cpu_util_wake: Compute cpu utilization with any contributions from
|
|
+ * the waking task p removed. check_for_migration() looks for a better CPU of
|
|
+ * rq->curr. For that case we should return cpu util with contributions from
|
|
+ * currently running task p removed.
|
|
*/
|
|
-static int cpu_util(int cpu)
|
|
+static int cpu_util_wake(int cpu, struct task_struct *p)
|
|
{
|
|
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
|
|
- unsigned long capacity = capacity_orig_of(cpu);
|
|
+ unsigned long util, capacity;
|
|
+
|
|
+#ifdef CONFIG_SCHED_WALT
|
|
+ /*
|
|
+ * WALT does not decay idle tasks in the same manner
|
|
+ * as PELT, so it makes little sense to subtract task
|
|
+ * utilization from cpu utilization. Instead just use
|
|
+ * cpu_util for this case.
|
|
+ */
|
|
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
|
|
+ p->state == TASK_WAKING)
|
|
+ return cpu_util(cpu);
|
|
+#endif
|
|
+ /* Task has no contribution or is new */
|
|
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
|
|
+ return cpu_util(cpu);
|
|
+
|
|
+ capacity = capacity_orig_of(cpu);
|
|
+ util = max_t(long, cpu_util(cpu) - task_util(p), 0);
|
|
|
|
return (util >= capacity) ? capacity : util;
|
|
}
|
|
|
|
-static inline int task_util(struct task_struct *p)
|
|
+static int start_cpu(bool boosted)
|
|
{
|
|
- return p->se.avg.util_avg;
|
|
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
|
|
+
|
|
+ return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
|
|
+}
|
|
+
|
|
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
|
|
+ bool boosted, bool prefer_idle)
|
|
+{
|
|
+ unsigned long best_idle_min_cap_orig = ULONG_MAX;
|
|
+ unsigned long min_util = boosted_task_util(p);
|
|
+ unsigned long target_capacity = ULONG_MAX;
|
|
+ unsigned long min_wake_util = ULONG_MAX;
|
|
+ unsigned long target_max_spare_cap = 0;
|
|
+ unsigned long target_util = ULONG_MAX;
|
|
+ unsigned long best_active_util = ULONG_MAX;
|
|
+ unsigned long target_idle_max_spare_cap = 0;
|
|
+ int best_idle_cstate = INT_MAX;
|
|
+ struct sched_domain *sd;
|
|
+ struct sched_group *sg;
|
|
+ int best_active_cpu = -1;
|
|
+ int best_idle_cpu = -1;
|
|
+ int target_cpu = -1;
|
|
+ int cpu, i;
|
|
+
|
|
+ *backup_cpu = -1;
|
|
+
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts);
|
|
+ schedstat_inc(this_rq()->eas_stats.fbt_attempts);
|
|
+
|
|
+ /* Find start CPU based on boost value */
|
|
+ cpu = start_cpu(boosted);
|
|
+ if (cpu < 0) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu);
|
|
+ schedstat_inc(this_rq()->eas_stats.fbt_no_cpu);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /* Find SD for the start CPU */
|
|
+ sd = rcu_dereference(per_cpu(sd_ea, cpu));
|
|
+ if (!sd) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd);
|
|
+ schedstat_inc(this_rq()->eas_stats.fbt_no_sd);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /* Scan CPUs in all SDs */
|
|
+ sg = sd->groups;
|
|
+ do {
|
|
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
|
|
+ unsigned long capacity_curr = capacity_curr_of(i);
|
|
+ unsigned long capacity_orig = capacity_orig_of(i);
|
|
+ unsigned long wake_util, new_util, min_capped_util;
|
|
+
|
|
+ if (!cpu_online(i))
|
|
+ continue;
|
|
+
|
|
+ if (walt_cpu_high_irqload(i))
|
|
+ continue;
|
|
+
|
|
+ /*
|
|
+ * p's blocked utilization is still accounted for on prev_cpu
|
|
+ * so prev_cpu will receive a negative bias due to the double
|
|
+ * accounting. However, the blocked utilization may be zero.
|
|
+ */
|
|
+ wake_util = cpu_util_wake(i, p);
|
|
+ new_util = wake_util + task_util(p);
|
|
+
|
|
+ /*
|
|
+ * Ensure minimum capacity to grant the required boost.
|
|
+ * The target CPU can be already at a capacity level higher
|
|
+ * than the one required to boost the task.
|
|
+ */
|
|
+ new_util = max(min_util, new_util);
|
|
+
|
|
+ /*
|
|
+ * Include minimum capacity constraint:
|
|
+ * new_util contains the required utilization including
|
|
+ * boost. min_capped_util also takes into account a
|
|
+ * minimum capacity cap imposed on the CPU by external
|
|
+ * actors.
|
|
+ */
|
|
+ min_capped_util = max(new_util, capacity_min_of(i));
|
|
+
|
|
+ if (new_util > capacity_orig)
|
|
+ continue;
|
|
+
|
|
+ /*
|
|
+ * Case A) Latency sensitive tasks
|
|
+ *
|
|
+ * Unconditionally favoring tasks that prefer idle CPU to
|
|
+ * improve latency.
|
|
+ *
|
|
+ * Looking for:
|
|
+ * - an idle CPU, whatever its idle_state is, since
|
|
+ * the first CPUs we explore are more likely to be
|
|
+ * reserved for latency sensitive tasks.
|
|
+ * - a non idle CPU where the task fits in its current
|
|
+ * capacity and has the maximum spare capacity.
|
|
+ * - a non idle CPU with lower contention from other
|
|
+ * tasks and running at the lowest possible OPP.
|
|
+ *
|
|
+ * The last two goals try to favor a non idle CPU
|
|
+ * where the task can run as if it is "almost alone".
|
|
+ * A maximum spare capacity CPU is favoured since
|
|
+ * the task already fits into that CPU's capacity
|
|
+ * without waiting for an OPP chance.
|
|
+ *
|
|
+ * The following code path is the only one in the CPUs
|
|
+ * exploration loop which is always used by
|
|
+ * prefer_idle tasks. It exits the loop with either a
|
|
+ * best_active_cpu or a target_cpu which should
|
|
+ * represent an optimal choice for latency sensitive
|
|
+ * tasks.
|
|
+ */
|
|
+ if (prefer_idle) {
|
|
+
|
|
+ /*
|
|
+ * Case A.1: IDLE CPU
|
|
+ * Return the first IDLE CPU we find.
|
|
+ */
|
|
+ if (idle_cpu(i)) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle);
|
|
+ schedstat_inc(this_rq()->eas_stats.fbt_pref_idle);
|
|
+
|
|
+ trace_sched_find_best_target(p,
|
|
+ prefer_idle, min_util,
|
|
+ cpu, best_idle_cpu,
|
|
+ best_active_cpu, i);
|
|
+
|
|
+ return i;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Case A.2: Target ACTIVE CPU
|
|
+ * Favor CPUs with max spare capacity.
|
|
+ */
|
|
+ if ((capacity_curr > new_util) &&
|
|
+ (capacity_orig - new_util > target_max_spare_cap)) {
|
|
+ target_max_spare_cap = capacity_orig - new_util;
|
|
+ target_cpu = i;
|
|
+ continue;
|
|
+ }
|
|
+ if (target_cpu != -1)
|
|
+ continue;
|
|
+
|
|
+
|
|
+ /*
|
|
+ * Case A.3: Backup ACTIVE CPU
|
|
+ * Favor CPUs with:
|
|
+ * - lower utilization due to other tasks
|
|
+ * - lower utilization with the task in
|
|
+ */
|
|
+ if (wake_util > min_wake_util)
|
|
+ continue;
|
|
+ if (new_util > best_active_util)
|
|
+ continue;
|
|
+ min_wake_util = wake_util;
|
|
+ best_active_util = new_util;
|
|
+ best_active_cpu = i;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Enforce EAS mode
|
|
+ *
|
|
+ * For non latency sensitive tasks, skip CPUs that
|
|
+ * will be overutilized by moving the task there.
|
|
+ *
|
|
+ * The goal here is to remain in EAS mode as long as
|
|
+ * possible at least for !prefer_idle tasks.
|
|
+ */
|
|
+ if ((new_util * capacity_margin) >
|
|
+ (capacity_orig * SCHED_CAPACITY_SCALE))
|
|
+ continue;
|
|
+
|
|
+ /*
|
|
+ * Case B) Non latency sensitive tasks on IDLE CPUs.
|
|
+ *
|
|
+ * Find an optimal backup IDLE CPU for non latency
|
|
+ * sensitive tasks.
|
|
+ *
|
|
+ * Looking for:
|
|
+ * - minimizing the capacity_orig,
|
|
+ * i.e. preferring LITTLE CPUs
|
|
+ * - favoring shallowest idle states
|
|
+ * i.e. avoid waking up deep-idle CPUs
|
|
+ *
|
|
+ * The following code path is used by non latency
|
|
+ * sensitive tasks if IDLE CPUs are available. If at
|
|
+ * least one such CPU is available, it sets the
|
|
+ * best_idle_cpu to the most suitable idle CPU to be
|
|
+ * selected.
|
|
+ *
|
|
+ * If idle CPUs are available, favour these CPUs to
|
|
+ * improve performances by spreading tasks.
|
|
+ * Indeed, the energy_diff() computed by the caller
|
|
+ * will take care to ensure the minimization of energy
|
|
+ * consumptions without affecting performance.
|
|
+ */
|
|
+ if (idle_cpu(i)) {
|
|
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
|
|
+
|
|
+ /* Select idle CPU with lower cap_orig */
|
|
+ if (capacity_orig > best_idle_min_cap_orig)
|
|
+ continue;
|
|
+ /* Favor CPUs that won't end up running at a
|
|
+ * high OPP.
|
|
+ */
|
|
+ if ((capacity_orig - min_capped_util) <
|
|
+ target_idle_max_spare_cap)
|
|
+ continue;
|
|
+
|
|
+ /*
|
|
+ * Skip CPUs in deeper idle state, but only
|
|
+ * if they are also less energy efficient.
|
|
+ * IOW, prefer a deep IDLE LITTLE CPU vs a
|
|
+ * shallow idle big CPU.
|
|
+ */
|
|
+ if (sysctl_sched_cstate_aware &&
|
|
+ best_idle_cstate <= idle_idx)
|
|
+ continue;
|
|
+
|
|
+ /* Keep track of best idle CPU */
|
|
+ best_idle_min_cap_orig = capacity_orig;
|
|
+ target_idle_max_spare_cap = capacity_orig -
|
|
+ min_capped_util;
|
|
+ best_idle_cstate = idle_idx;
|
|
+ best_idle_cpu = i;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Case C) Non latency sensitive tasks on ACTIVE CPUs.
|
|
+ *
|
|
+ * Pack tasks in the most energy efficient capacities.
|
|
+ *
|
|
+ * This task packing strategy prefers more energy
|
|
+ * efficient CPUs (i.e. pack on smaller maximum
|
|
+ * capacity CPUs) while also trying to spread tasks to
|
|
+ * run them all at the lower OPP.
|
|
+ *
|
|
+ * This assumes for example that it's more energy
|
|
+ * efficient to run two tasks on two CPUs at a lower
|
|
+ * OPP than packing both on a single CPU but running
|
|
+ * that CPU at an higher OPP.
|
|
+ *
|
|
+ * Thus, this case keeps track of the CPU with the
|
|
+ * smallest maximum capacity and highest spare maximum
|
|
+ * capacity.
|
|
+ */
|
|
+
|
|
+ /* Favor CPUs with smaller capacity */
|
|
+ if (capacity_orig > target_capacity)
|
|
+ continue;
|
|
+
|
|
+ /* Favor CPUs with maximum spare capacity */
|
|
+ if ((capacity_orig - min_capped_util) <
|
|
+ target_max_spare_cap)
|
|
+ continue;
|
|
+
|
|
+ target_max_spare_cap = capacity_orig - min_capped_util;
|
|
+ target_capacity = capacity_orig;
|
|
+ target_util = new_util;
|
|
+ target_cpu = i;
|
|
+ }
|
|
+
|
|
+ } while (sg = sg->next, sg != sd->groups);
|
|
+
|
|
+ /*
|
|
+ * For non latency sensitive tasks, cases B and C in the previous loop,
|
|
+ * we pick the best IDLE CPU only if we were not able to find a target
|
|
+ * ACTIVE CPU.
|
|
+ *
|
|
+ * Policies priorities:
|
|
+ *
|
|
+ * - prefer_idle tasks:
|
|
+ *
|
|
+ * a) IDLE CPU available, we return immediately
|
|
+ * b) ACTIVE CPU where task fits and has the bigger maximum spare
|
|
+ * capacity (i.e. target_cpu)
|
|
+ * c) ACTIVE CPU with less contention due to other tasks
|
|
+ * (i.e. best_active_cpu)
|
|
+ *
|
|
+ * - NON prefer_idle tasks:
|
|
+ *
|
|
+ * a) ACTIVE CPU: target_cpu
|
|
+ * b) IDLE CPU: best_idle_cpu
|
|
+ */
|
|
+ if (target_cpu == -1)
|
|
+ target_cpu = prefer_idle
|
|
+ ? best_active_cpu
|
|
+ : best_idle_cpu;
|
|
+ else
|
|
+ *backup_cpu = prefer_idle
|
|
+ ? best_active_cpu
|
|
+ : best_idle_cpu;
|
|
+
|
|
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
|
|
+ best_idle_cpu, best_active_cpu,
|
|
+ target_cpu);
|
|
+
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_count);
|
|
+ schedstat_inc(this_rq()->eas_stats.fbt_count);
|
|
+
|
|
+ return target_cpu;
|
|
}
|
|
|
|
/*
|
|
* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
|
|
* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
|
|
- *
|
|
+ *
|
|
* In that case WAKE_AFFINE doesn't make sense and we'll let
|
|
* BALANCE_WAKE sort things out.
|
|
*/
|
|
static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
|
|
{
|
|
long min_cap, max_cap;
|
|
-
|
|
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
|
|
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
|
|
-
|
|
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
|
|
/* Minimum capacity is close to max, no need to abort wake_affine */
|
|
if (max_cap - min_cap < max_cap >> 3)
|
|
return 0;
|
|
|
|
+ /* Bring task utilization in sync with prev_cpu */
|
|
+ sync_entity_load_avg(&p->se);
|
|
+
|
|
return min_cap * 1024 < task_util(p) * capacity_margin;
|
|
}
|
|
|
|
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
|
|
+{
|
|
+ bool boosted, prefer_idle;
|
|
+ struct sched_domain *sd;
|
|
+ int target_cpu;
|
|
+ int backup_cpu;
|
|
+ int next_cpu;
|
|
+
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
|
|
+ schedstat_inc(this_rq()->eas_stats.secb_attempts);
|
|
+
|
|
+ if (sysctl_sched_sync_hint_enable && sync) {
|
|
+ int cpu = smp_processor_id();
|
|
+
|
|
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_sync);
|
|
+ schedstat_inc(this_rq()->eas_stats.secb_sync);
|
|
+ return cpu;
|
|
+ }
|
|
+ }
|
|
+
|
|
+#ifdef CONFIG_CGROUP_SCHEDTUNE
|
|
+ boosted = schedtune_task_boost(p) > 0;
|
|
+ prefer_idle = schedtune_prefer_idle(p) > 0;
|
|
+#else
|
|
+ boosted = get_sysctl_sched_cfs_boost() > 0;
|
|
+ prefer_idle = 0;
|
|
+#endif
|
|
+
|
|
+ rcu_read_lock();
|
|
+
|
|
+ sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
|
|
+ if (!sd) {
|
|
+ target_cpu = prev_cpu;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ sync_entity_load_avg(&p->se);
|
|
+
|
|
+ /* Find a cpu with sufficient capacity */
|
|
+ next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);
|
|
+ if (next_cpu == -1) {
|
|
+ target_cpu = prev_cpu;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
|
|
+ if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
|
|
+ schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
|
|
+ target_cpu = next_cpu;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ target_cpu = prev_cpu;
|
|
+ if (next_cpu != prev_cpu) {
|
|
+ int delta = 0;
|
|
+ struct energy_env eenv = {
|
|
+ .p = p,
|
|
+ .util_delta = task_util(p),
|
|
+ /* Task's previous CPU candidate */
|
|
+ .cpu[EAS_CPU_PRV] = {
|
|
+ .cpu_id = prev_cpu,
|
|
+ },
|
|
+ /* Main alternative CPU candidate */
|
|
+ .cpu[EAS_CPU_NXT] = {
|
|
+ .cpu_id = next_cpu,
|
|
+ },
|
|
+ /* Backup alternative CPU candidate */
|
|
+ .cpu[EAS_CPU_BKP] = {
|
|
+ .cpu_id = backup_cpu,
|
|
+ },
|
|
+ };
|
|
+
|
|
+
|
|
+#ifdef CONFIG_SCHED_WALT
|
|
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
|
|
+ p->state == TASK_WAKING)
|
|
+ delta = task_util(p);
|
|
+#endif
|
|
+ /* Not enough spare capacity on previous cpu */
|
|
+ if (__cpu_overutilized(prev_cpu, delta)) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
|
|
+ schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
|
|
+ target_cpu = next_cpu;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ /* Check if EAS_CPU_NXT is a more energy efficient CPU */
|
|
+ if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
|
|
+ schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
|
|
+ target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
|
|
+ schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
|
|
+ target_cpu = prev_cpu;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_count);
|
|
+ schedstat_inc(this_rq()->eas_stats.secb_count);
|
|
+
|
|
+unlock:
|
|
+ rcu_read_unlock();
|
|
+ return target_cpu;
|
|
+}
|
|
+
|
|
/*
|
|
* select_task_rq_fair: Select target runqueue for the waking task in domains
|
|
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
|
|
@@ -5591,10 +7192,13 @@ select_task_rq_fair(struct task_struct *
|
|
|
|
if (sd_flag & SD_BALANCE_WAKE) {
|
|
record_wakee(p);
|
|
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
|
|
- && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
|
|
+ want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
|
|
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)));
|
|
}
|
|
|
|
+ if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
|
|
+ return select_energy_cpu_brute(p, prev_cpu, sync);
|
|
+
|
|
rcu_read_lock();
|
|
for_each_domain(cpu, tmp) {
|
|
if (!(tmp->flags & SD_LOAD_BALANCE))
|
|
@@ -5622,43 +7226,21 @@ select_task_rq_fair(struct task_struct *
|
|
new_cpu = cpu;
|
|
}
|
|
|
|
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
|
|
+ /*
|
|
+ * We're going to need the task's util for capacity_spare_wake
|
|
+ * in find_idlest_group. Sync it up to prev_cpu's
|
|
+ * last_update_time.
|
|
+ */
|
|
+ sync_entity_load_avg(&p->se);
|
|
+ }
|
|
+
|
|
if (!sd) {
|
|
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
|
|
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
|
|
|
- } else while (sd) {
|
|
- struct sched_group *group;
|
|
- int weight;
|
|
-
|
|
- if (!(sd->flags & sd_flag)) {
|
|
- sd = sd->child;
|
|
- continue;
|
|
- }
|
|
-
|
|
- group = find_idlest_group(sd, p, cpu, sd_flag);
|
|
- if (!group) {
|
|
- sd = sd->child;
|
|
- continue;
|
|
- }
|
|
-
|
|
- new_cpu = find_idlest_cpu(group, p, cpu);
|
|
- if (new_cpu == -1 || new_cpu == cpu) {
|
|
- /* Now try balancing at a lower domain level of cpu */
|
|
- sd = sd->child;
|
|
- continue;
|
|
- }
|
|
-
|
|
- /* Now try balancing at a lower domain level of new_cpu */
|
|
- cpu = new_cpu;
|
|
- weight = sd->span_weight;
|
|
- sd = NULL;
|
|
- for_each_domain(cpu, tmp) {
|
|
- if (weight <= tmp->span_weight)
|
|
- break;
|
|
- if (tmp->flags & sd_flag)
|
|
- sd = tmp;
|
|
- }
|
|
- /* while loop will break here if sd == NULL */
|
|
+ } else {
|
|
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
@@ -5718,6 +7300,8 @@ static void task_dead_fair(struct task_s
|
|
{
|
|
remove_entity_load_avg(&p->se);
|
|
}
|
|
+#else
|
|
+#define task_fits_max(p, cpu) true
|
|
#endif /* CONFIG_SMP */
|
|
|
|
static unsigned long
|
|
@@ -5964,6 +7548,8 @@ again:
|
|
if (hrtick_enabled(rq))
|
|
hrtick_start_fair(rq, p);
|
|
|
|
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
|
|
+
|
|
return p;
|
|
simple:
|
|
cfs_rq = &rq->cfs;
|
|
@@ -5985,9 +7571,12 @@ simple:
|
|
if (hrtick_enabled(rq))
|
|
hrtick_start_fair(rq, p);
|
|
|
|
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
|
|
+
|
|
return p;
|
|
|
|
idle:
|
|
+ rq->misfit_task = 0;
|
|
/*
|
|
* This is OK, because current is on_cpu, which avoids it being picked
|
|
* for load-balance and preemption/IRQs are still disabled avoiding
|
|
@@ -6200,6 +7789,13 @@ static unsigned long __read_mostly max_l
|
|
|
|
enum fbq_type { regular, remote, all };
|
|
|
|
+enum group_type {
|
|
+ group_other = 0,
|
|
+ group_misfit_task,
|
|
+ group_imbalanced,
|
|
+ group_overloaded,
|
|
+};
|
|
+
|
|
#define LBF_ALL_PINNED 0x01
|
|
#define LBF_NEED_BREAK 0x02
|
|
#define LBF_DST_PINNED 0x04
|
|
@@ -6218,6 +7814,7 @@ struct lb_env {
|
|
int new_dst_cpu;
|
|
enum cpu_idle_type idle;
|
|
long imbalance;
|
|
+ unsigned int src_grp_nr_running;
|
|
/* The set of CPUs under consideration for load-balancing */
|
|
struct cpumask *cpus;
|
|
|
|
@@ -6228,6 +7825,7 @@ struct lb_env {
|
|
unsigned int loop_max;
|
|
|
|
enum fbq_type fbq_type;
|
|
+ enum group_type busiest_group_type;
|
|
struct list_head tasks;
|
|
};
|
|
|
|
@@ -6409,7 +8007,9 @@ static void detach_task(struct task_stru
|
|
|
|
p->on_rq = TASK_ON_RQ_MIGRATING;
|
|
deactivate_task(env->src_rq, p, 0);
|
|
+ double_lock_balance(env->src_rq, env->dst_rq);
|
|
set_task_cpu(p, env->dst_cpu);
|
|
+ double_unlock_balance(env->src_rq, env->dst_rq);
|
|
}
|
|
|
|
/*
|
|
@@ -6593,12 +8193,19 @@ static void update_blocked_averages(int
|
|
* list_add_leaf_cfs_rq() for details.
|
|
*/
|
|
for_each_leaf_cfs_rq(rq, cfs_rq) {
|
|
+ struct sched_entity *se;
|
|
+
|
|
/* throttled entities do not contribute to load */
|
|
if (throttled_hierarchy(cfs_rq))
|
|
continue;
|
|
|
|
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
|
|
update_tg_load_avg(cfs_rq, 0);
|
|
+
|
|
+ /* Propagate pending load changes to the parent, if any: */
|
|
+ se = cfs_rq->tg->se[cpu];
|
|
+ if (se && !skip_blocked_update(se))
|
|
+ update_load_avg(se, 0);
|
|
}
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
|
}
|
|
@@ -6670,12 +8277,6 @@ static unsigned long task_h_load(struct
|
|
|
|
/********** Helpers for find_busiest_group ************************/
|
|
|
|
-enum group_type {
|
|
- group_other = 0,
|
|
- group_imbalanced,
|
|
- group_overloaded,
|
|
-};
|
|
-
|
|
/*
|
|
* sg_lb_stats - stats of a sched_group required for load_balancing
|
|
*/
|
|
@@ -6691,6 +8292,7 @@ struct sg_lb_stats {
|
|
unsigned int group_weight;
|
|
enum group_type group_type;
|
|
int group_no_capacity;
|
|
+ int group_misfit_task; /* A cpu has a task too big for its capacity */
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
unsigned int nr_numa_running;
|
|
unsigned int nr_preferred_running;
|
|
@@ -6788,13 +8390,46 @@ static unsigned long scale_rt_capacity(i
|
|
return 1;
|
|
}
|
|
|
|
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
|
|
+{
|
|
+ raw_spin_lock_init(&mcc->lock);
|
|
+ mcc->val = 0;
|
|
+ mcc->cpu = -1;
|
|
+}
|
|
+
|
|
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
|
|
{
|
|
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
|
|
struct sched_group *sdg = sd->groups;
|
|
+ struct max_cpu_capacity *mcc;
|
|
+ unsigned long max_capacity;
|
|
+ int max_cap_cpu;
|
|
+ unsigned long flags;
|
|
|
|
cpu_rq(cpu)->cpu_capacity_orig = capacity;
|
|
|
|
+ capacity *= arch_scale_max_freq_capacity(sd, cpu);
|
|
+ capacity >>= SCHED_CAPACITY_SHIFT;
|
|
+
|
|
+ mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
|
|
+
|
|
+ raw_spin_lock_irqsave(&mcc->lock, flags);
|
|
+ max_capacity = mcc->val;
|
|
+ max_cap_cpu = mcc->cpu;
|
|
+
|
|
+ if ((max_capacity > capacity && max_cap_cpu == cpu) ||
|
|
+ (max_capacity < capacity)) {
|
|
+ mcc->val = capacity;
|
|
+ mcc->cpu = cpu;
|
|
+#ifdef CONFIG_SCHED_DEBUG
|
|
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
|
|
+ pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
|
|
+ goto skip_unlock;
|
|
+#endif
|
|
+ }
|
|
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
|
|
+
|
|
+skip_unlock: __attribute__ ((unused));
|
|
capacity *= scale_rt_capacity(cpu);
|
|
capacity >>= SCHED_CAPACITY_SHIFT;
|
|
|
|
@@ -6803,13 +8438,15 @@ static void update_cpu_capacity(struct s
|
|
|
|
cpu_rq(cpu)->cpu_capacity = capacity;
|
|
sdg->sgc->capacity = capacity;
|
|
+ sdg->sgc->max_capacity = capacity;
|
|
+ sdg->sgc->min_capacity = capacity;
|
|
}
|
|
|
|
void update_group_capacity(struct sched_domain *sd, int cpu)
|
|
{
|
|
struct sched_domain *child = sd->child;
|
|
struct sched_group *group, *sdg = sd->groups;
|
|
- unsigned long capacity;
|
|
+ unsigned long capacity, max_capacity, min_capacity;
|
|
unsigned long interval;
|
|
|
|
interval = msecs_to_jiffies(sd->balance_interval);
|
|
@@ -6822,6 +8459,8 @@ void update_group_capacity(struct sched_
|
|
}
|
|
|
|
capacity = 0;
|
|
+ max_capacity = 0;
|
|
+ min_capacity = ULONG_MAX;
|
|
|
|
if (child->flags & SD_OVERLAP) {
|
|
/*
|
|
@@ -6846,11 +8485,13 @@ void update_group_capacity(struct sched_
|
|
*/
|
|
if (unlikely(!rq->sd)) {
|
|
capacity += capacity_of(cpu);
|
|
- continue;
|
|
+ } else {
|
|
+ sgc = rq->sd->groups->sgc;
|
|
+ capacity += sgc->capacity;
|
|
}
|
|
|
|
- sgc = rq->sd->groups->sgc;
|
|
- capacity += sgc->capacity;
|
|
+ max_capacity = max(capacity, max_capacity);
|
|
+ min_capacity = min(capacity, min_capacity);
|
|
}
|
|
} else {
|
|
/*
|
|
@@ -6860,12 +8501,18 @@ void update_group_capacity(struct sched_
|
|
|
|
group = child->groups;
|
|
do {
|
|
- capacity += group->sgc->capacity;
|
|
+ struct sched_group_capacity *sgc = group->sgc;
|
|
+
|
|
+ capacity += sgc->capacity;
|
|
+ max_capacity = max(sgc->max_capacity, max_capacity);
|
|
+ min_capacity = min(sgc->min_capacity, min_capacity);
|
|
group = group->next;
|
|
} while (group != child->groups);
|
|
}
|
|
|
|
sdg->sgc->capacity = capacity;
|
|
+ sdg->sgc->max_capacity = max_capacity;
|
|
+ sdg->sgc->min_capacity = min_capacity;
|
|
}
|
|
|
|
/*
|
|
@@ -6960,6 +8607,17 @@ group_is_overloaded(struct lb_env *env,
|
|
return false;
|
|
}
|
|
|
|
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity + capacity_margin - SCHED_CAPACITY_SCALE <
+ ref->sgc->max_capacity;
+}
+
static inline enum
|
|
group_type group_classify(struct sched_group *group,
|
|
struct sg_lb_stats *sgs)
|
|
@@ -6970,9 +8628,44 @@ group_type group_classify(struct sched_g
|
|
if (sg_imbalanced(group))
|
|
return group_imbalanced;
|
|
|
|
+ if (sgs->group_misfit_task)
|
|
+ return group_misfit_task;
|
|
+
|
|
return group_other;
|
|
}
|
|
|
|
+#ifdef CONFIG_NO_HZ_COMMON
|
|
+/*
|
|
+ * idle load balancing data
|
|
+ * - used by the nohz balance, but we want it available here
|
|
+ * so that we can see which CPUs have no tick.
|
|
+ */
|
|
+static struct {
|
|
+ cpumask_var_t idle_cpus_mask;
|
|
+ atomic_t nr_cpus;
|
|
+ unsigned long next_balance; /* in jiffy units */
|
|
+} nohz ____cacheline_aligned;
|
|
+
|
|
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
|
|
+{
|
|
+ /* only called from update_sg_lb_stats when irqs are disabled */
|
|
+ if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
|
|
+ /* rate limit updates to once-per-jiffie at most */
|
|
+ if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
|
|
+ return;
|
|
+
|
|
+ raw_spin_lock(&rq->lock);
|
|
+ update_rq_clock(rq);
|
|
+ cpu_load_update_idle(rq);
|
|
+ update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
|
|
+ raw_spin_unlock(&rq->lock);
|
|
+ }
|
|
+}
|
|
+
|
|
+#else
|
|
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
|
|
+#endif
|
|
+
|
|
/**
|
|
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
|
|
* @env: The load balancing environment.
|
|
@@ -6981,11 +8674,12 @@ group_type group_classify(struct sched_g
|
|
* @local_group: Does group contain this_cpu.
|
|
* @sgs: variable to hold the statistics for this group.
|
|
* @overload: Indicate more than one runnable task for any CPU.
|
|
+ * @overutilized: Indicate overutilization for any CPU.
|
|
*/
|
|
static inline void update_sg_lb_stats(struct lb_env *env,
|
|
struct sched_group *group, int load_idx,
|
|
int local_group, struct sg_lb_stats *sgs,
|
|
- bool *overload)
|
|
+ bool *overload, bool *overutilized)
|
|
{
|
|
unsigned long load;
|
|
int i, nr_running;
|
|
@@ -6995,6 +8689,12 @@ static inline void update_sg_lb_stats(st
|
|
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
|
|
struct rq *rq = cpu_rq(i);
|
|
|
|
+ /* if we are entering idle and there are CPUs with
|
|
+ * their tick stopped, do an update for them
|
|
+ */
|
|
+ if (env->idle == CPU_NEWLY_IDLE)
|
|
+ update_cpu_stats_if_tickless(rq);
|
|
+
|
|
/* Bias balancing toward cpus of our domain */
|
|
if (local_group)
|
|
load = target_load(i, load_idx);
|
|
@@ -7019,6 +8719,12 @@ static inline void update_sg_lb_stats(st
|
|
*/
|
|
if (!nr_running && idle_cpu(i))
|
|
sgs->idle_cpus++;
|
|
+
|
|
+ if (cpu_overutilized(i)) {
|
|
+ *overutilized = true;
|
|
+ if (!sgs->group_misfit_task && rq->misfit_task)
|
|
+ sgs->group_misfit_task = capacity_of(i);
|
|
+ }
|
|
}
|
|
|
|
/* Adjust by relative CPU capacity of the group */
|
|
@@ -7060,9 +8766,31 @@ static bool update_sd_pick_busiest(struc
|
|
if (sgs->group_type < busiest->group_type)
|
|
return false;
|
|
|
|
+ /*
|
|
+ * Candidate sg doesn't face any serious load-balance problems
|
|
+ * so don't pick it if the local sg is already filled up.
|
|
+ */
|
|
+ if (sgs->group_type == group_other &&
|
|
+ !group_has_capacity(env, &sds->local_stat))
|
|
+ return false;
|
|
+
|
|
if (sgs->avg_load <= busiest->avg_load)
|
|
return false;
|
|
|
|
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
|
|
+ goto asym_packing;
|
|
+
|
|
+ /*
|
|
+ * Candidate sg has no more than one task per CPU and
|
|
+ * has higher per-CPU capacity. Migrating tasks to less
|
|
+ * capable CPUs may harm throughput. Maximize throughput,
|
|
+ * power/energy consequences are not considered.
|
|
+ */
|
|
+ if (sgs->sum_nr_running <= sgs->group_weight &&
|
|
+ group_smaller_cpu_capacity(sds->local, sg))
|
|
+ return false;
|
|
+
|
|
+asym_packing:
|
|
/* This is the busiest node in its class. */
|
|
if (!(env->sd->flags & SD_ASYM_PACKING))
|
|
return true;
|
|
@@ -7117,6 +8845,9 @@ static inline enum fbq_type fbq_classify
|
|
}
|
|
#endif /* CONFIG_NUMA_BALANCING */
|
|
|
|
+#define lb_sd_parent(sd) \
|
|
+ (sd->parent && sd->parent->groups != sd->parent->groups->next)
|
|
+
|
|
/**
|
|
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
|
|
* @env: The load balancing environment.
|
|
@@ -7128,7 +8859,7 @@ static inline void update_sd_lb_stats(st
|
|
struct sched_group *sg = env->sd->groups;
|
|
struct sg_lb_stats tmp_sgs;
|
|
int load_idx, prefer_sibling = 0;
|
|
- bool overload = false;
|
|
+ bool overload = false, overutilized = false;
|
|
|
|
if (child && child->flags & SD_PREFER_SIBLING)
|
|
prefer_sibling = 1;
|
|
@@ -7150,7 +8881,7 @@ static inline void update_sd_lb_stats(st
|
|
}
|
|
|
|
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
|
|
- &overload);
|
|
+ &overload, &overutilized);
|
|
|
|
if (local_group)
|
|
goto next_group;
|
|
@@ -7172,6 +8903,15 @@ static inline void update_sd_lb_stats(st
|
|
sgs->group_type = group_classify(sg, sgs);
|
|
}
|
|
|
|
+ /*
|
|
+ * Ignore task groups with misfit tasks if local group has no
|
|
+ * capacity or if per-cpu capacity isn't higher.
|
|
+ */
|
|
+ if (sgs->group_type == group_misfit_task &&
|
|
+ (!group_has_capacity(env, &sds->local_stat) ||
|
|
+ !group_smaller_cpu_capacity(sg, sds->local)))
|
|
+ sgs->group_type = group_other;
|
|
+
|
|
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
|
|
sds->busiest = sg;
|
|
sds->busiest_stat = *sgs;
|
|
@@ -7188,10 +8928,23 @@ next_group:
|
|
if (env->sd->flags & SD_NUMA)
|
|
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
|
|
|
|
- if (!env->sd->parent) {
|
|
+ env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
|
|
+
|
|
+ if (!lb_sd_parent(env->sd)) {
|
|
/* update overload indicator if we are at root domain */
|
|
if (env->dst_rq->rd->overload != overload)
|
|
env->dst_rq->rd->overload = overload;
|
|
+
|
|
+ /* Update over-utilization (tipping point, U >= 0) indicator */
|
|
+ if (env->dst_rq->rd->overutilized != overutilized) {
|
|
+ env->dst_rq->rd->overutilized = overutilized;
|
|
+ trace_sched_overutilized(overutilized);
|
|
+ }
|
|
+ } else {
|
|
+ if (!env->dst_rq->rd->overutilized && overutilized) {
|
|
+ env->dst_rq->rd->overutilized = true;
|
|
+ trace_sched_overutilized(true);
|
|
+ }
|
|
}
|
|
|
|
}
|
|
@@ -7344,6 +9097,22 @@ static inline void calculate_imbalance(s
|
|
*/
|
|
if (busiest->avg_load <= sds->avg_load ||
|
|
local->avg_load >= sds->avg_load) {
|
|
+ /* Misfitting tasks should be migrated in any case */
|
|
+ if (busiest->group_type == group_misfit_task) {
|
|
+ env->imbalance = busiest->group_misfit_task;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Busiest group is overloaded, local is not, use the spare
|
|
+ * cycles to maximize throughput
|
|
+ */
|
|
+ if (busiest->group_type == group_overloaded &&
|
|
+ local->group_type <= group_misfit_task) {
|
|
+ env->imbalance = busiest->load_per_task;
|
|
+ return;
|
|
+ }
|
|
+
|
|
env->imbalance = 0;
|
|
return fix_small_imbalance(env, sds);
|
|
}
|
|
@@ -7377,6 +9146,11 @@ static inline void calculate_imbalance(s
|
|
(sds->avg_load - local->avg_load) * local->group_capacity
|
|
) / SCHED_CAPACITY_SCALE;
|
|
|
|
+ /* Boost imbalance to allow misfit task to be balanced. */
|
|
+ if (busiest->group_type == group_misfit_task)
|
|
+ env->imbalance = max_t(long, env->imbalance,
|
|
+ busiest->group_misfit_task);
|
|
+
|
|
/*
|
|
* if *imbalance is less than the average load per runnable task
|
|
* there is no guarantee that any tasks will be moved so we'll have
|
|
@@ -7412,6 +9186,10 @@ static struct sched_group *find_busiest_
|
|
* this level.
|
|
*/
|
|
update_sd_lb_stats(env, &sds);
|
|
+
|
|
+ if (energy_aware() && !env->dst_rq->rd->overutilized)
|
|
+ goto out_balanced;
|
|
+
|
|
local = &sds.local_stat;
|
|
busiest = &sds.busiest_stat;
|
|
|
|
@@ -7434,11 +9212,19 @@ static struct sched_group *find_busiest_
|
|
if (busiest->group_type == group_imbalanced)
|
|
goto force_balance;
|
|
|
|
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
|
|
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
|
|
+ /*
|
|
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
|
|
+ * capacities from resulting in underutilization due to avg_load.
|
|
+ */
|
|
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
|
|
busiest->group_no_capacity)
|
|
goto force_balance;
|
|
|
|
+ /* Misfitting tasks should be dealt with regardless of the avg load */
|
|
+ if (busiest->group_type == group_misfit_task) {
|
|
+ goto force_balance;
|
|
+ }
|
|
+
|
|
/*
|
|
* If the local group is busier than the selected busiest group
|
|
* don't try and pull any tasks.
|
|
@@ -7462,7 +9248,8 @@ static struct sched_group *find_busiest_
|
|
* might end up to just move the imbalance on another group
|
|
*/
|
|
if ((busiest->group_type != group_overloaded) &&
|
|
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
|
|
+ (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
|
|
+ !group_smaller_cpu_capacity(sds.busiest, sds.local))
|
|
goto out_balanced;
|
|
} else {
|
|
/*
|
|
@@ -7475,6 +9262,7 @@ static struct sched_group *find_busiest_
|
|
}
|
|
|
|
force_balance:
|
|
+ env->busiest_group_type = busiest->group_type;
|
|
/* Looks like there is an imbalance. Compute it */
|
|
calculate_imbalance(env, &sds);
|
|
return sds.busiest;
|
|
@@ -7533,7 +9321,8 @@ static struct rq *find_busiest_queue(str
|
|
*/
|
|
|
|
if (rq->nr_running == 1 && wl > env->imbalance &&
|
|
- !check_cpu_capacity(rq, env->sd))
|
|
+ !check_cpu_capacity(rq, env->sd) &&
|
|
+ env->busiest_group_type != group_misfit_task)
|
|
continue;
|
|
|
|
/*
|
|
@@ -7591,6 +9380,14 @@ static int need_active_balance(struct lb
|
|
return 1;
|
|
}
|
|
|
|
+ if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
|
|
+ ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
|
|
+ env->src_rq->cfs.h_nr_running == 1 &&
|
|
+ cpu_overutilized(env->src_cpu) &&
|
|
+ !cpu_overutilized(env->dst_cpu)) {
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
|
|
}
|
|
|
|
@@ -7639,7 +9436,7 @@ static int load_balance(int this_cpu, st
|
|
int *continue_balancing)
|
|
{
|
|
int ld_moved, cur_ld_moved, active_balance = 0;
|
|
- struct sched_domain *sd_parent = sd->parent;
|
|
+ struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
|
|
struct sched_group *group;
|
|
struct rq *busiest;
|
|
unsigned long flags;
|
|
@@ -7706,6 +9503,7 @@ redo:
|
|
|
|
more_balance:
|
|
raw_spin_lock_irqsave(&busiest->lock, flags);
|
|
+ update_rq_clock(busiest);
|
|
|
|
/*
|
|
* cur_ld_moved - load moved in current iteration
|
|
@@ -7803,7 +9601,8 @@ more_balance:
|
|
* excessive cache_hot migrations and active balances.
|
|
*/
|
|
if (idle != CPU_NEWLY_IDLE)
|
|
- sd->nr_balance_failed++;
|
|
+ if (env.src_grp_nr_running > 1)
|
|
+ sd->nr_balance_failed++;
|
|
|
|
if (need_active_balance(&env)) {
|
|
raw_spin_lock_irqsave(&busiest->lock, flags);
|
|
@@ -7940,8 +9739,9 @@ static int idle_balance(struct rq *this_
|
|
*/
|
|
this_rq->idle_stamp = rq_clock(this_rq);
|
|
|
|
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
|
|
- !this_rq->rd->overload) {
|
|
+ if (!energy_aware() &&
|
|
+ (this_rq->avg_idle < sysctl_sched_migration_cost ||
|
|
+ !this_rq->rd->overload)) {
|
|
rcu_read_lock();
|
|
sd = rcu_dereference_check_sched_domain(this_rq->sd);
|
|
if (sd)
|
|
@@ -8032,8 +9832,18 @@ static int active_load_balance_cpu_stop(
|
|
int busiest_cpu = cpu_of(busiest_rq);
|
|
int target_cpu = busiest_rq->push_cpu;
|
|
struct rq *target_rq = cpu_rq(target_cpu);
|
|
- struct sched_domain *sd;
|
|
+ struct sched_domain *sd = NULL;
|
|
struct task_struct *p = NULL;
|
|
+ struct task_struct *push_task = NULL;
|
|
+ int push_task_detached = 0;
|
|
+ struct lb_env env = {
|
|
+ .sd = sd,
|
|
+ .dst_cpu = target_cpu,
|
|
+ .dst_rq = target_rq,
|
|
+ .src_cpu = busiest_rq->cpu,
|
|
+ .src_rq = busiest_rq,
|
|
+ .idle = CPU_IDLE,
|
|
+ };
|
|
|
|
raw_spin_lock_irq(&busiest_rq->lock);
|
|
|
|
@@ -8053,6 +9863,17 @@ static int active_load_balance_cpu_stop(
|
|
*/
|
|
BUG_ON(busiest_rq == target_rq);
|
|
|
|
+ push_task = busiest_rq->push_task;
|
|
+ if (push_task) {
|
|
+ if (task_on_rq_queued(push_task) &&
|
|
+ task_cpu(push_task) == busiest_cpu &&
|
|
+ cpu_online(target_cpu)) {
|
|
+ detach_task(push_task, &env);
|
|
+ push_task_detached = 1;
|
|
+ }
|
|
+ goto out_unlock;
|
|
+ }
|
|
+
|
|
/* Search for an sd spanning us and the target CPU. */
|
|
rcu_read_lock();
|
|
for_each_domain(target_cpu, sd) {
|
|
@@ -8062,16 +9883,9 @@ static int active_load_balance_cpu_stop(
|
|
}
|
|
|
|
if (likely(sd)) {
|
|
- struct lb_env env = {
|
|
- .sd = sd,
|
|
- .dst_cpu = target_cpu,
|
|
- .dst_rq = target_rq,
|
|
- .src_cpu = busiest_rq->cpu,
|
|
- .src_rq = busiest_rq,
|
|
- .idle = CPU_IDLE,
|
|
- };
|
|
-
|
|
+ env.sd = sd;
|
|
schedstat_inc(sd->alb_count);
|
|
+ update_rq_clock(busiest_rq);
|
|
|
|
p = detach_one_task(&env);
|
|
if (p) {
|
|
@@ -8085,8 +9899,18 @@ static int active_load_balance_cpu_stop(
|
|
rcu_read_unlock();
|
|
out_unlock:
|
|
busiest_rq->active_balance = 0;
|
|
+
|
|
+ if (push_task)
|
|
+ busiest_rq->push_task = NULL;
|
|
+
|
|
raw_spin_unlock(&busiest_rq->lock);
|
|
|
|
+ if (push_task) {
|
|
+ if (push_task_detached)
|
|
+ attach_one_task(target_rq, push_task);
|
|
+ put_task_struct(push_task);
|
|
+ }
|
|
+
|
|
if (p)
|
|
attach_one_task(target_rq, p);
|
|
|
|
@@ -8107,12 +9931,6 @@ static inline int on_null_domain(struct
|
|
* needed, they will kick the idle load balancer, which then does idle
|
|
* load balancing for all the idle CPUs.
|
|
*/
|
|
-static struct {
|
|
- cpumask_var_t idle_cpus_mask;
|
|
- atomic_t nr_cpus;
|
|
- unsigned long next_balance; /* in jiffy units */
|
|
-} nohz ____cacheline_aligned;
|
|
-
|
|
static inline int find_new_ilb(void)
|
|
{
|
|
int ilb = cpumask_first(nohz.idle_cpus_mask);
|
|
@@ -8446,9 +10264,14 @@ static inline bool nohz_kick_needed(stru
|
|
if (time_before(now, nohz.next_balance))
|
|
return false;
|
|
|
|
- if (rq->nr_running >= 2)
|
|
+ if (rq->nr_running >= 2 &&
|
|
+ (!energy_aware() || cpu_overutilized(cpu)))
|
|
return true;
|
|
|
|
+ /* Do idle load balance if there is a misfit task */
|
|
+ if (energy_aware())
|
|
+ return rq->misfit_task;
|
|
+
|
|
rcu_read_lock();
|
|
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
|
|
if (sds) {
|
|
@@ -8542,6 +10365,47 @@ static void rq_offline_fair(struct rq *r
|
|
unthrottle_offline_cfs_rqs(rq);
|
|
}
|
|
|
|
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock(&rq->lock);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock(&rq->lock);
+
+ return rc;
+}
+
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int new_cpu;
+ int active_balance;
+ int cpu = task_cpu(p);
+
+ if (energy_aware() && rq->misfit_task) {
+ if (rq->curr->state != TASK_RUNNING ||
+ rq->curr->nr_cpus_allowed == 1)
+ return;
+
+ new_cpu = select_energy_cpu_brute(p, cpu, 0);
+ if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ stop_one_cpu_nowait(cpu,
+ active_load_balance_cpu_stop,
+ rq, &rq->active_balance_work);
+ }
+ }
+}
+
#endif /* CONFIG_SMP */
|
|
|
|
/*
|
|
@@ -8559,6 +10423,16 @@ static void task_tick_fair(struct rq *rq
|
|
|
|
if (static_branch_unlikely(&sched_numa_balancing))
|
|
task_tick_numa(rq, curr);
|
|
+
|
|
+#ifdef CONFIG_SMP
|
|
+ if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
|
|
+ rq->rd->overutilized = true;
|
|
+ trace_sched_overutilized(true);
|
|
+ }
|
|
+
|
|
+ rq->misfit_task = !task_fits_max(curr, rq->cpu);
|
|
+#endif
|
|
+
|
|
}
|
|
|
|
/*
|
|
@@ -8645,32 +10519,45 @@ static inline bool vruntime_normalized(s
|
|
return false;
|
|
}
|
|
|
|
-static void detach_task_cfs_rq(struct task_struct *p)
|
|
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
+/*
|
|
+ * Propagate the changes of the sched_entity across the tg tree to make it
|
|
+ * visible to the root
|
|
+ */
|
|
+static void propagate_entity_cfs_rq(struct sched_entity *se)
|
|
{
|
|
- struct sched_entity *se = &p->se;
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- u64 now = cfs_rq_clock_task(cfs_rq);
|
|
+ struct cfs_rq *cfs_rq;
|
|
|
|
- if (!vruntime_normalized(p)) {
|
|
- /*
|
|
- * Fix up our vruntime so that the current sleep doesn't
|
|
- * cause 'unlimited' sleep bonus.
|
|
- */
|
|
- place_entity(cfs_rq, se, 0);
|
|
- se->vruntime -= cfs_rq->min_vruntime;
|
|
+ /* Start to propagate at parent */
|
|
+ se = se->parent;
|
|
+
|
|
+ for_each_sched_entity(se) {
|
|
+ cfs_rq = cfs_rq_of(se);
|
|
+
|
|
+ if (cfs_rq_throttled(cfs_rq))
|
|
+ break;
|
|
+
|
|
+ update_load_avg(se, UPDATE_TG);
|
|
}
|
|
+}
|
|
+#else
|
|
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
|
|
+#endif
|
|
+
|
|
+static void detach_entity_cfs_rq(struct sched_entity *se)
|
|
+{
|
|
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
|
/* Catch up with the cfs_rq and remove our load when we leave */
|
|
- update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
+ update_load_avg(se, 0);
|
|
detach_entity_load_avg(cfs_rq, se);
|
|
update_tg_load_avg(cfs_rq, false);
|
|
+ propagate_entity_cfs_rq(se);
|
|
}
|
|
|
|
-static void attach_task_cfs_rq(struct task_struct *p)
|
|
+static void attach_entity_cfs_rq(struct sched_entity *se)
|
|
{
|
|
- struct sched_entity *se = &p->se;
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
/*
|
|
@@ -8680,10 +10567,36 @@ static void attach_task_cfs_rq(struct ta
|
|
se->depth = se->parent ? se->parent->depth + 1 : 0;
|
|
#endif
|
|
|
|
- /* Synchronize task with its cfs_rq */
|
|
- update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
+ /* Synchronize entity with its cfs_rq */
|
|
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
|
|
attach_entity_load_avg(cfs_rq, se);
|
|
update_tg_load_avg(cfs_rq, false);
|
|
+ propagate_entity_cfs_rq(se);
|
|
+}
|
|
+
|
|
+static void detach_task_cfs_rq(struct task_struct *p)
|
|
+{
|
|
+ struct sched_entity *se = &p->se;
|
|
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
+
|
|
+ if (!vruntime_normalized(p)) {
|
|
+ /*
|
|
+ * Fix up our vruntime so that the current sleep doesn't
|
|
+ * cause 'unlimited' sleep bonus.
|
|
+ */
|
|
+ place_entity(cfs_rq, se, 0);
|
|
+ se->vruntime -= cfs_rq->min_vruntime;
|
|
+ }
|
|
+
|
|
+ detach_entity_cfs_rq(se);
|
|
+}
|
|
+
|
|
+static void attach_task_cfs_rq(struct task_struct *p)
|
|
+{
|
|
+ struct sched_entity *se = &p->se;
|
|
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
+
|
|
+ attach_entity_cfs_rq(se);
|
|
|
|
if (!vruntime_normalized(p))
|
|
se->vruntime += cfs_rq->min_vruntime;
|
|
@@ -8737,6 +10650,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
|
|
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
|
|
#endif
|
|
#ifdef CONFIG_SMP
|
|
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
+ cfs_rq->propagate_avg = 0;
|
|
+#endif
|
|
atomic_long_set(&cfs_rq->removed_load_avg, 0);
|
|
atomic_long_set(&cfs_rq->removed_util_avg, 0);
|
|
#endif
|
|
@@ -8845,7 +10761,8 @@ void online_fair_sched_group(struct task
|
|
se = tg->se[i];
|
|
|
|
raw_spin_lock_irq(&rq->lock);
|
|
- post_init_entity_util_avg(se);
|
|
+ update_rq_clock(rq);
|
|
+ attach_entity_cfs_rq(se);
|
|
sync_throttle(tg, i);
|
|
raw_spin_unlock_irq(&rq->lock);
|
|
}
|
|
@@ -8937,8 +10854,10 @@ int sched_group_set_shares(struct task_g
|
|
|
|
/* Possible calls to update_curr() need rq clock */
|
|
update_rq_clock(rq);
|
|
- for_each_sched_entity(se)
|
|
- update_cfs_shares(group_cfs_rq(se));
|
|
+ for_each_sched_entity(se) {
|
|
+ update_load_avg(se, UPDATE_TG);
|
|
+ update_cfs_shares(se);
|
|
+ }
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
|
}
|
|
|