diff -drupN a/kernel/sched/fair.c b/kernel/sched/fair.c --- a/kernel/sched/fair.c 2018-08-06 17:23:04.000000000 +0300 +++ b/kernel/sched/fair.c 2022-06-12 05:28:14.000000000 +0300 @@ -30,10 +30,13 @@ #include #include #include +#include #include #include "sched.h" +#include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -50,6 +53,15 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_sync_hint_enable = 1; +unsigned int sysctl_sched_cstate_aware = 1; + +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -116,7 +128,7 @@ unsigned int sysctl_sched_cfs_bandwidth_ /* * The margin used when comparing utilization with CPU capacity: - * util * 1024 < capacity * margin + * util * margin < capacity * 1024 */ unsigned int capacity_margin = 1280; /* ~20% */ @@ -290,19 +302,59 @@ static inline struct cfs_rq *group_cfs_r static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. 
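Stepping back to the corrected capacity_margin comment earlier in this hunk: the fixed form "util * margin < capacity * 1024" reads naturally as a predicate. A minimal user-space sketch of that check, assuming SCHED_CAPACITY_SCALE == 1024; the fits_capacity() name and the utilization values are illustrative only, not part of this patch.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* ~20% headroom, matching capacity_margin = 1280 in the patch */
static const unsigned long capacity_margin = 1280;

/* util fits a CPU when util * margin < capacity * 1024 */
static bool fits_capacity(unsigned long util, unsigned long capacity)
{
	return util * capacity_margin < capacity * SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/* 800 * 1280 = 1024000 <  1048576: fits with ~20% headroom left */
	printf("util 800 on cap 1024: %d\n", fits_capacity(800, 1024));
	/* 900 * 1280 = 1152000 >= 1048576: does not fit */
	printf("util 900 on cap 1024: %d\n", fits_capacity(900, 1024));
	return 0;
}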
+ */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; @@ -699,6 +751,7 @@ void init_entity_runnable_average(struct if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -708,9 +761,7 @@ void init_entity_runnable_average(struct } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -742,7 +793,6 @@ void post_init_entity_util_avg(struct sc struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; - u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -770,14 +820,12 @@ void post_init_entity_util_avg(struct sc * such that the next switched_to_fair() has the * expected state. */ - se->avg.last_update_time = now; + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); return; } } - update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); } #else /* !CONFIG_SMP */ @@ -937,6 +985,7 @@ update_stats_enqueue_sleeper(struct cfs_ } trace_sched_stat_blocked(tsk, delta); + trace_sched_blocked_reason(tsk); /* * Blocking time is in units of nanosecs, so shift by @@ -2646,16 +2695,20 @@ static void reweight_entity(struct cfs_r static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) return; + + if (throttled_hierarchy(cfs_rq)) + return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2664,8 +2717,9 @@ static void update_cfs_shares(struct cfs reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2816,6 +2870,7 @@ __update_load_avg(u64 now, int cpu, stru scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; @@ -2891,6 +2946,26 @@ __update_load_avg(u64 now, int cpu, stru return decayed; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. 
+ */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -2970,8 +3045,168 @@ void set_task_rq_fair(struct sched_entit se->avg.last_update_time = n_last_update_time; } } + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. 
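The shares clamp in update_tg_cfs_load() above is easiest to see with numbers. A simplified standalone sketch of just the final scaling step, with made-up values and scale_load_down() treated as the identity.

#include <stdio.h>

int main(void)
{
	long shares  = 1024;	/* scale_load_down(tg->shares) */
	long tg_load = 2048;	/* whole task group's load, summed over CPUs */
	long load    =  512;	/* this group cfs_rq's load_avg */

	/*
	 * A group entity may not weigh more than its share of tg->shares,
	 * so scale this cfs_rq's contribution by shares / tg_load.
	 */
	if (tg_load > shares)
		load = load * shares / tg_load;

	printf("scaled group entity load = %ld\n", load);	/* 256 */
	return 0;
}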
+ */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) @@ -3042,6 +3277,7 @@ update_cfs_rq_load_avg(u64 now, struct c sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3049,6 +3285,7 @@ update_cfs_rq_load_avg(u64 now, struct c sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -3062,27 +3299,51 @@ update_cfs_rq_load_avg(u64 now, struct c if (update_freq && (decayed || removed_util)) cfs_rq_util_change(cfs_rq); + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */ + if (cfs_rq == &rq_of(cfs_rq)->cfs) + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + return decayed || removed_load; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); + int decayed; + void *ptr = NULL; /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if 
(se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); + + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) { +#ifdef CONFIG_SCHED_WALT + ptr = (void *)&(task_of(se)->ravg); +#endif + trace_sched_load_avg_task(task_of(se), &se->avg, ptr); + } } /** @@ -3095,31 +3356,12 @@ static inline void update_load_avg(struc */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - * - * Or we're fresh through post_init_entity_util_avg(). - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3134,14 +3376,12 @@ skip_aging: */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3151,34 +3391,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3207,13 +3433,25 @@ static inline u64 cfs_rq_last_update_tim #endif /* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. 
+ */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + +/* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3225,9 +3463,7 @@ void remove_entity_load_avg(struct sched * calls this. */ - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -3252,7 +3488,10 @@ update_cfs_rq_load_avg(u64 now, struct c return 0; } -static inline void update_load_avg(struct sched_entity *se, int not_used) +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1) { cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); } @@ -3397,9 +3636,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, st if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); @@ -3471,6 +3719,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, st * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Substract its load from the cfs_rq->runnable_avg. + * - Substract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -3494,7 +3752,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -3558,7 +3816,7 @@ set_next_entity(struct cfs_rq *cfs_rq, s */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3676,8 +3934,8 @@ entity_tick(struct cfs_rq *cfs_rq, struc /* * Ensure that runnable average is periodically updated. 
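sync_entity_load_avg(), added earlier in this hunk, only ages the entity's signal up to the cfs_rq's last_update_time; no new running or runnable contribution is accumulated. A rough standalone sketch of that catch-up decay, assuming the usual PELT constants (y^32 = 1/2, one period of roughly 1ms) and made-up numbers.

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* PELT decays load geometrically: y^32 == 1/2, one period ~= 1024us */
	const double y = pow(0.5, 1.0 / 32.0);
	double util = 600.0;		/* entity's util_avg at dequeue (made up) */
	unsigned int slept_ms = 64;	/* time since the cfs_rq's last update */

	/* catching up simply applies the decay of the missed periods */
	for (unsigned int p = 0; p < slept_ms; p++)
		util *= y;

	/* 64 periods == two half-lives: 600 -> ~150 */
	printf("decayed util_avg ~= %.0f\n", util);
	return 0;
}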
*/ - update_load_avg(curr, 1); - update_cfs_shares(cfs_rq); + update_load_avg(curr, UPDATE_TG); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4528,6 +4786,14 @@ static inline void hrtick_update(struct } #endif +#ifdef CONFIG_SMP +static bool __cpu_overutilized(int cpu, int delta); +static bool cpu_overutilized(int cpu); +unsigned long boosted_cpu_util(int cpu); +#else +#define boosted_cpu_util(cpu) cpu_util_freq(cpu) +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4538,6 +4804,9 @@ enqueue_task_fair(struct rq *rq, struct { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#ifdef CONFIG_SMP + int task_new = flags & ENQUEUE_WAKEUP_NEW; +#endif /* * If in_iowait is set, the code below may not trigger any cpufreq @@ -4562,6 +4831,7 @@ enqueue_task_fair(struct rq *rq, struct if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4569,17 +4839,49 @@ enqueue_task_fair(struct rq *rq, struct for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); - update_cfs_shares(cfs_rq); + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); } if (!se) add_nr_running(rq, 1); +#ifdef CONFIG_SMP + + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); + + if (!se) { + walt_inc_cumulative_runnable_avg(rq, p); + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) { + rq->rd->overutilized = true; + trace_sched_overutilized(true); + } + } + +#endif /* CONFIG_SMP */ hrtick_update(rq); } @@ -4609,6 +4911,7 @@ static void dequeue_task_fair(struct rq if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4628,17 +4931,33 @@ static void dequeue_task_fair(struct rq for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); - update_cfs_shares(cfs_rq); + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); } if (!se) sub_nr_running(rq, 1); +#ifdef CONFIG_SMP + + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. 
+ */ + schedtune_dequeue_task(p, cpu_of(rq)); + + if (!se) + walt_dec_cumulative_runnable_avg(rq, p); +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -4945,15 +5264,6 @@ static unsigned long target_load(int cpu return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -5105,6 +5415,532 @@ static void record_wakee(struct task_str } /* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +/* + * Returns the current capacity of cpu after applying both + * cpu and min freq scaling. + */ +unsigned long capacity_min_of(int cpu) +{ + if (!sched_feat(MIN_CAPACITY_CAPPING)) + return 0; + return arch_scale_cpu_capacity(NULL, cpu) * + arch_scale_min_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + + +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + +/* + * CPU candidates. + * + * These are labels to reference CPU candidates for an energy_diff. + * Currently we support only two possible candidates: the task's previous CPU + * and another candiate CPU. + * More advanced/aggressive EAS selection policies can consider more + * candidates. + */ +#define EAS_CPU_PRV 0 +#define EAS_CPU_NXT 1 +#define EAS_CPU_BKP 2 +#define EAS_CPU_CNT 3 + +/* + * energy_diff - supports the computation of the estimated energy impact in + * moving a "task"'s "util_delta" between different CPU candidates. + */ +struct energy_env { + /* Utilization to move */ + struct task_struct *p; + int util_delta; + + /* Mask of CPUs candidates to evaluate */ + cpumask_t cpus_mask; + + /* CPU candidates to evaluate */ + struct { + + /* CPU ID, must be in cpus_mask */ + int cpu_id; + + /* + * Index (into sched_group_energy::cap_states) of the OPP the + * CPU needs to run at if the task is placed on it. + * This includes the both active and blocked load, due to + * other tasks on this CPU, as well as the task's own + * utilization. + */ + int cap_idx; + int cap; + + /* Estimated system energy */ + unsigned int energy; + + /* Estimated energy variation wrt EAS_CPU_PRV */ + int nrg_delta; + + } cpu[EAS_CPU_CNT]; + + /* + * Index (into energy_env::cpu) of the morst energy efficient CPU for + * the specified energy_env::task + */ + int next_idx; + + /* Support data */ + struct sched_group *sg_top; + struct sched_group *sg_cap; + struct sched_group *sg; +}; + +static int cpu_util_wake(int cpu, struct task_struct *p); + +/* + * __cpu_norm_util() returns the cpu util relative to a specific capacity, + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for + * energy calculations. + * + * Since util is a scale-invariant utilization defined as: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. 
+ * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) +{ + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx) +{ + unsigned long max_util = 0; + unsigned long util; + int cpu; + + for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { + util = cpu_util_wake(cpu, eenv->p); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) + util += eenv->util_delta; + + max_util = max(max_util, util); + + /* + * Take into account any minimum frequency imposed + * elsewhere which limits the energy states available + * If the MIN_CAPACITY_CAPPING feature is not enabled + * capacity_min_of will return 0 (not capped). + */ + max_util = max(max_util, capacity_min_of(cpu)); + + } + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use + * in energy calculations. + * + * Since task executions may or may not overlap in time in the group the true + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i)) + * when iterating over all CPUs in the group. + * The latter estimate is used as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned +long group_norm_util(struct energy_env *eenv, int cpu_idx) +{ + unsigned long capacity = eenv->cpu[cpu_idx].cap; + unsigned long util, util_sum = 0; + int cpu; + + for_each_cpu(cpu, sched_group_cpus(eenv->sg)) { + util = cpu_util_wake(cpu, eenv->p); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) + util += eenv->util_delta; + + util_sum += __cpu_norm_util(util, capacity); + } + + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); +} + +static int find_new_capacity(struct energy_env *eenv, int cpu_idx) +{ + const struct sched_group_energy *sge = eenv->sg->sge; + int idx, max_idx = sge->nr_cap_states - 1; + unsigned long util = group_max_util(eenv, cpu_idx); + + /* default is max_cap if we don't find a match */ + eenv->cpu[cpu_idx].cap_idx = max_idx; + eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap; + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) { + /* Keep track of SG's capacity */ + eenv->cpu[cpu_idx].cap_idx = idx; + eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap; + break; + } + } + + return eenv->cpu[cpu_idx].cap_idx; +} + +static int group_idle_state(struct energy_env *eenv, int cpu_idx) +{ + struct sched_group *sg = eenv->sg; + int i, state = INT_MAX; + int src_in_grp, dst_in_grp; + long grp_util = 0; + + /* Find the shallowest idle state in the sched group. 
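find_new_capacity() above walks the sched_group_energy cap_states table and picks the lowest OPP whose capacity covers the group's maximum expected utilization. A standalone sketch of that selection; the pick_cap_idx() helper and the capacity/power table are invented for illustration.

#include <stdio.h>

struct cap_state { unsigned long cap; unsigned long power; };

/* Pick the lowest OPP whose capacity covers the group's max expected util */
static int pick_cap_idx(const struct cap_state *cs, int nr, unsigned long util)
{
	int idx;

	for (idx = 0; idx < nr; idx++)
		if (cs[idx].cap >= util)
			return idx;
	return nr - 1;	/* no OPP is large enough: default to the highest */
}

int main(void)
{
	/* made-up capacity/power states for one frequency domain */
	const struct cap_state cs[] = {
		{ .cap =  256, .power =  50 },
		{ .cap =  512, .power = 120 },
		{ .cap = 1024, .power = 400 },
	};

	printf("util 300 -> OPP %d\n", pick_cap_idx(cs, 3, 300));	/* 1 */
	printf("util 700 -> OPP %d\n", pick_cap_idx(cs, 3, 700));	/* 2 */
	return 0;
}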
*/ + for_each_cpu(i, sched_group_cpus(sg)) + state = min(state, idle_get_state_idx(cpu_rq(i))); + + /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ + state++; + + src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id, + sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id, + sched_group_cpus(sg)); + if (src_in_grp == dst_in_grp) { + /* both CPUs under consideration are in the same group or not in + * either group, migration should leave idle state the same. + */ + goto end; + } + + /* + * Try to estimate if a deeper idle state is + * achievable when we move the task. + */ + for_each_cpu(i, sched_group_cpus(sg)) { + grp_util += cpu_util_wake(i, eenv->p); + if (unlikely(i == eenv->cpu[cpu_idx].cpu_id)) + grp_util += eenv->util_delta; + } + + if (grp_util <= + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) { + /* after moving, this group is at most partly + * occupied, so it should have some idle time. + */ + int max_idle_state_idx = sg->sge->nr_idle_states - 2; + int new_state = grp_util * max_idle_state_idx; + if (grp_util <= 0) + /* group will have no util, use lowest state */ + new_state = max_idle_state_idx + 1; + else { + /* for partially idle, linearly map util to idle + * states, excluding the lowest one. This does not + * correspond to the state we expect to enter in + * reality, but an indication of what might happen. + */ + new_state = min(max_idle_state_idx, (int) + (new_state / sg->sgc->max_capacity)); + new_state = max_idle_state_idx - new_state; + } + state = new_state; + } else { + /* After moving, the group will be fully occupied + * so assume it will not be idle at all. + */ + state = 0; + } +end: + return state; +} + +/* + * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg). + * + * This works in iterations to compute the SG's energy for each CPU + * candidate defined by the energy_env's cpu array. + * + * NOTE: in the following computations for busy_energy and idle_energy we do + * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors. + * The required scaling will be performed just one time, by the calling + * functions, once we accumulated the contributons for all the SGs. + */ +static void calc_sg_energy(struct energy_env *eenv) +{ + struct sched_group *sg = eenv->sg; + int busy_energy, idle_energy; + unsigned int busy_power; + unsigned int idle_power; + unsigned long sg_util; + int cap_idx, idle_idx; + int total_energy = 0; + int cpu_idx; + + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + + + if (eenv->cpu[cpu_idx].cpu_id == -1) + continue; + /* Compute ACTIVE energy */ + cap_idx = find_new_capacity(eenv, cpu_idx); + busy_power = sg->sge->cap_states[cap_idx].power; + /* + * in order to calculate cpu_norm_util, we need to know which + * capacity level the group will be at, so calculate that first + */ + sg_util = group_norm_util(eenv, cpu_idx); + + busy_energy = sg_util * busy_power; + + /* Compute IDLE energy */ + idle_idx = group_idle_state(eenv, cpu_idx); + idle_power = sg->sge->idle_states[idle_idx].power; + + idle_energy = SCHED_CAPACITY_SCALE - sg_util; + idle_energy *= idle_power; + + total_energy = busy_energy + idle_energy; + eenv->cpu[cpu_idx].energy += total_energy; + } +} + +/* + * compute_energy() computes the absolute variation in energy consumption by + * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT. + * + * NOTE: compute_energy() may fail when racing with sched_domain updates, in + * which case we abort by returning -EINVAL. 
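calc_sg_energy() above weights the group's active power by its normalized busy ratio and its idle power by the remaining fraction, deferring the final shift as the comment explains. A worked standalone example with invented power numbers.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

int main(void)
{
	/* made-up figures for one sched_group at the selected OPP */
	unsigned long sg_util    = 512;	/* normalized busy ratio (out of 1024) */
	unsigned long busy_power = 400;	/* cap_states[cap_idx].power */
	unsigned long idle_power = 20;	/* idle_states[idle_idx].power */

	/*
	 * Energy splits between the busy and the idle fraction of time.
	 * As in calc_sg_energy(), the >> SCHED_CAPACITY_SHIFT scaling is
	 * left to the caller to limit rounding errors.
	 */
	unsigned long busy_energy = sg_util * busy_power;
	unsigned long idle_energy = (SCHED_CAPACITY_SCALE - sg_util) * idle_power;

	/* 512*400 + 512*20 = 215040 unscaled; ~210 once scaled by 1024 */
	printf("unscaled energy = %lu\n", busy_energy + idle_energy);
	printf("scaled energy   = %lu\n", (busy_energy + idle_energy) >> 10);
	return 0;
}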
+ */ +static int compute_energy(struct energy_env *eenv) +{ + struct cpumask visit_cpus; + int cpu_count; + + WARN_ON(!eenv->sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); + /* If a cpu is hotplugged in while we are in this function, + * it does not appear in the existing visit_cpus mask + * which came from the sched_group pointer of the + * sched_domain pointed at by sd_ea for either the prev + * or next cpu and was dereferenced in __energy_diff. + * Since we will dereference sd_scs later as we iterate + * through the CPUs we expect to visit, new CPUs can + * be present which are not in the visit_cpus mask. + * Guard this with cpu_count. + */ + cpu_count = cpumask_weight(&visit_cpus); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + int cpu = cpumask_first(&visit_cpus); + struct sched_domain *sd; + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + * This sd may have groups with cpus which were not present + * when we took visit_cpus. + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + if (sd && sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + struct sched_group *sg = sd->groups; + + /* Has this sched_domain already been visited? */ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + eenv->sg_cap = sg; + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + eenv->sg_cap = sg_shared_cap; + + /* + * Compute the energy for all the candidate + * CPUs in the current visited SG. + */ + eenv->sg = sg; + calc_sg_energy(eenv); + + if (!sd->child) { + /* + * cpu_count here is the number of + * cpus we expect to visit in this + * calculation. If we race against + * hotplug, we can have extra cpus + * added to the groups we are + * iterating which do not appear in + * the visit_cpus mask. In that case + * we are not able to calculate energy + * without restarting so we will bail + * out and use prev_cpu this time. + */ + if (!cpu_count) + return -EINVAL; + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + cpu_count--; + } + + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } + + /* + * If we raced with hotplug and got an sd NULL-pointer; + * returning a wrong energy estimation is better than + * entering an infinite loop. + * Specifically: If a cpu is unplugged after we took + * the visit_cpus mask, it no longer has an sd_scs + * pointer, so when we dereference it, we get NULL. + */ + if (cpumask_test_cpu(cpu, &visit_cpus)) + return -EINVAL; +next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); + continue; + } + + return 0; +} + +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * select_energy_cpu_idx(): estimate the energy impact of changing the + * utilization distribution. + * + * The eenv parameter specifies the changes: utilisation amount and a pair of + * possible CPU candidates (the previous CPU and a different target CPU). + * + * This function returns the index of a CPU candidate specified by the + * energy_env which corresponds to the first CPU saving energy. + * Thus, 0 (EAS_CPU_PRV) means that non of the CPU candidate is more energy + * efficient than running on prev_cpu. This is also the value returned in case + * of abort due to error conditions during the computations. 
+ * A value greater than zero means that the first energy-efficient CPU is the + * one represented by eenv->cpu[eenv->next_idx].cpu_id. + */ +static inline int select_energy_cpu_idx(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1; + int cpu_idx; + int margin; + + sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id; + sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); + if (!sd) + return EAS_CPU_PRV; + + cpumask_clear(&eenv->cpus_mask); + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + int cpu = eenv->cpu[cpu_idx].cpu_id; + + if (cpu < 0) + continue; + cpumask_set_cpu(cpu, &eenv->cpus_mask); + } + + sg = sd->groups; + do { + /* Skip SGs which do not contains a candidate CPU */ + if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg))) + continue; + + eenv->sg_top = sg; + /* energy is unscaled to reduce rounding errors */ + if (compute_energy(eenv) == -EINVAL) + return EAS_CPU_PRV; + + } while (sg = sg->next, sg != sd->groups); + + /* Scale energy before comparisons */ + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) + eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT; + + /* + * Compute the dead-zone margin used to prevent too many task + * migrations with negligible energy savings. + * An energy saving is considered meaningful if it reduces the energy + * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56% + */ + margin = eenv->cpu[EAS_CPU_PRV].energy >> 6; + + /* + * By default the EAS_CPU_PRV CPU is considered the most energy + * efficient, with a 0 energy variation. + */ + eenv->next_idx = EAS_CPU_PRV; + + /* + * Compare the other CPU candidates to find a CPU which can be + * more energy efficient then EAS_CPU_PRV + */ + for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + /* Skip not valid scheduled candidates */ + if (eenv->cpu[cpu_idx].cpu_id < 0) + continue; + /* Compute energy delta wrt EAS_CPU_PRV */ + eenv->cpu[cpu_idx].nrg_delta = + eenv->cpu[cpu_idx].energy - + eenv->cpu[EAS_CPU_PRV].energy; + /* filter energy variations within the dead-zone margin */ + if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin) + eenv->cpu[cpu_idx].nrg_delta = 0; + /* update the schedule candidate with min(nrg_delta) */ + if (eenv->cpu[cpu_idx].nrg_delta < + eenv->cpu[eenv->next_idx].nrg_delta) { + eenv->next_idx = cpu_idx; + if (sched_feat(FBT_STRICT_ORDER)) + break; + } + } + + return eenv->next_idx; +} + +/* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. 
* * A waker of many should wake a different task than the one last awakened @@ -5200,24 +6036,180 @@ static int wake_affine(struct sched_doma return 1; } +static inline unsigned long task_util(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << SCHED_CAPACITY_SHIFT) / walt_ravg_window; + } +#endif + return p->se.avg.util_avg; +} + +static inline unsigned long boosted_task_util(struct task_struct *p); + +static inline bool __task_fits(struct task_struct *p, int cpu, int util) +{ + unsigned long capacity = capacity_of(cpu); + + util += boosted_task_util(p); + + return (capacity * 1024) > (util * capacity_margin); +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; + + if (capacity == max_capacity) + return true; + + if (capacity * capacity_margin > max_capacity * 1024) + return true; + + return __task_fits(p, cpu, 0); +} + +static bool __cpu_overutilized(int cpu, int delta) +{ + return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin); +} + +static bool cpu_overutilized(int cpu) +{ + return __cpu_overutilized(cpu, 0); +} + +#ifdef CONFIG_SCHED_TUNE + +struct reciprocal_value schedtune_spc_rdiv; + +static long +schedtune_margin(unsigned long signal, long boost) +{ + long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_CAPACITY_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + if (boost >= 0) { + margin = SCHED_CAPACITY_SCALE - signal; + margin *= boost; + } else { + margin = -signal * boost; + } + + margin = reciprocal_divide(margin, schedtune_spc_rdiv); + if (boost < 0) + margin *= -1; + + return margin; +} + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + int boost = schedtune_cpu_boost(cpu); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +static inline long +schedtune_task_margin(struct task_struct *p) +{ + int boost = schedtune_task_boost(p); + unsigned long util; + long margin; + + if (boost == 0) + return 0; + + util = task_util(p); + margin = schedtune_margin(util, boost); + + return margin; +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + return 0; +} + +static inline int +schedtune_task_margin(struct task_struct *p) +{ + return 0; +} + +#endif /* CONFIG_SCHED_TUNE */ + +unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util_freq(cpu); + long margin = schedtune_cpu_margin(util, cpu); + + trace_sched_boost_cpu(cpu, util, margin); + + return util + margin; +} + +static inline unsigned long +boosted_task_util(struct task_struct *p) +{ + unsigned long util = task_util(p); + long margin = schedtune_task_margin(p); + + trace_sched_boost_task(p, util, margin); + + return util + margin; +} + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. 
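schedtune_margin() above implements Signal Proportional Compensation. A compact standalone restatement with a worked example; it assumes, as in this code, that boost is a percentage in [-100..100] and that schedtune_spc_rdiv simply encodes the divide by 100.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024L

/*
 * Signal Proportional Compensation: the margin is proportional to the
 * headroom left by the signal (positive boost) or to the signal itself
 * (negative boost).
 */
static long spc_margin(long signal, long boost)
{
	long margin;

	if (boost >= 0)
		margin = boost * (SCHED_CAPACITY_SCALE - signal);
	else
		margin = boost * signal;	/* negative margin */

	return margin / 100;
}

int main(void)
{
	long util = 300;

	/* 50% boost of the remaining 724 capacity units: +362 */
	printf("boosted util   = %ld\n", util + spc_margin(util, 50));	/* 662 */
	/* -50% boost removes half of the signal itself: -150 */
	printf("deboosted util = %ld\n", util + spc_margin(util, -50));	/* 150 */
	return 0;
}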
*/ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; + struct sched_group *most_spare_sg = NULL; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long this_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, runnable_load; + unsigned long spare_cap, max_spare_cap; int local_group; int i; @@ -5229,8 +6221,13 @@ find_idlest_group(struct sched_domain *s local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + runnable_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5239,30 +6236,85 @@ find_idlest_group(struct sched_domain *s else load = target_load(i, load_idx); - avg_load += load; + runnable_load += load; + + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; if (local_group) { - this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_runnable_load = runnable_load; + this_avg_load = avg_load; + this_spare = max_spare_cap; + } else { + if (min_runnable_load > (runnable_load + imbalance)) { + /* + * The runnable load is significantly smaller + * so we can pick this new cpu + */ + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + /* + * The runnable loads are close so we take + * into account blocked load through avg_load + * which is blocked + runnable load + */ + min_avg_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); - if (!idlest || 100*this_load < imbalance*min_load) + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. 
+ * spare capacity can't be used for fork because the utilization has + * not been set yet as it need to get a rq to init the utilization + */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + + if (this_spare > task_util(p) / 2 && + imbalance_scale*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; + +skip_spare: + if (!idlest || + (min_runnable_load > (this_runnable_load + imbalance)) || + ((this_runnable_load < (min_runnable_load + imbalance)) && + (100*this_avg_load < imbalance_scale*min_avg_load))) return NULL; return idlest; } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5311,6 +6363,68 @@ find_idlest_cpu(struct sched_group *grou return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; + int new_cpu = cpu; + + if (wu) { + schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq()->eas_stats.cas_attempts); + } + + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (wu) + schedstat_inc(sd->eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p->se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq()->eas_stats.cas_count); + } + + return new_cpu; +} + #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5478,96 +6592,583 @@ static int select_idle_cpu(struct task_s static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - int i; + struct sched_group *sg; + int i = task_cpu(p); + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + unsigned long best_idle_capacity = ULONG_MAX; - if (idle_cpu(target)) - return target; + schedstat_inc(p->se.statistics.nr_wakeups_sis_attempts); + schedstat_inc(this_rq()->eas_stats.sis_attempts); + + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) { + schedstat_inc(p->se.statistics.nr_wakeups_sis_idle); + schedstat_inc(this_rq()->eas_stats.sis_idle); + return target; + } + + /* + * If the prevous cpu is cache affine and idle, don't be stupid. 
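The reworked find_idlest_group() above uses two slacks: an absolute one (imbalance) for runnable load and a relative one (imbalance_scale) for the blocked-load tie-break. A standalone sketch of that three-way decision with invented loads, assuming the common sched_domain default of imbalance_pct = 125.

#include <stdio.h>

int main(void)
{
	long nice_0_load   = 1024;
	long imbalance_pct = 125;	/* typical sched_domain default */

	/* absolute slack for runnable_load comparisons */
	long imbalance = nice_0_load * (imbalance_pct - 100) / 100;	/* 256 */
	/* relative slack for the avg_load (blocked load) tie-break */
	long imbalance_scale = 100 + (imbalance_pct - 100) / 2;		/* 112 */

	long min_runnable_load = 900, candidate_runnable_load = 700;
	long min_avg_load = 1200, candidate_avg_load = 1000;

	if (min_runnable_load > candidate_runnable_load + imbalance)
		printf("pick candidate: clearly less runnable load\n");
	else if (candidate_runnable_load < min_runnable_load + imbalance &&
		 100 * min_avg_load > imbalance_scale * candidate_avg_load)
		printf("pick candidate: loads close, less blocked load\n");
	else
		printf("keep current idlest group\n");
	return 0;
}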
+ */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) { + schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine); + schedstat_inc(this_rq()->eas_stats.sis_cache_affine); + return i; + } + + sd = rcu_dereference(per_cpu(sd_llc, target)); + if (!sd) + return target; + + i = select_idle_core(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_cpu(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_smt(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + } /* - * If the previous cpu is cache affine and idle, don't be stupid. + * Otherwise, iterate the domains and find an elegible idle cpu. */ - if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) - return prev; - sd = rcu_dereference(per_cpu(sd_llc, target)); - if (!sd) - return target; + for_each_lower_domain(sd) { + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - i = select_idle_cpu(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); - i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; + + if (i == target && new_usage <= capacity_curr_of(target)) { + schedstat_inc(p->se.statistics.nr_wakeups_sis_suff_cap); + schedstat_inc(this_rq()->eas_stats.sis_suff_cap); + schedstat_inc(sd->eas_stats.sis_suff_cap); + return target; + } + + if (idle_idx < best_idle_cstate && + capacity_orig <= best_idle_capacity) { + best_idle_cpu = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + schedstat_inc(p->se.statistics.nr_wakeups_sis_idle_cpu); + schedstat_inc(this_rq()->eas_stats.sis_idle_cpu); + schedstat_inc(sd->eas_stats.sis_idle_cpu); + goto done; + } +next: + sg = sg->next; + } while (sg != sd->groups); + } + + if (best_idle_cpu >= 0) + target = best_idle_cpu; + +done: + schedstat_inc(p->se.statistics.nr_wakeups_sis_count); + schedstat_inc(this_rq()->eas_stats.sis_count); return target; } - + /* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. 
- * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. check_for_migration() looks for a better CPU of + * rq->curr. For that case we should return cpu util with contributions from + * currently running task p removed. */ -static int cpu_util(int cpu) +static int cpu_util_wake(int cpu, struct task_struct *p) { - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); + unsigned long util, capacity; + +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util && + p->state == TASK_WAKING) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_util(cpu) - task_util(p), 0); return (util >= capacity) ? capacity : util; } -static inline int task_util(struct task_struct *p) +static int start_cpu(bool boosted) { - return p->se.avg.util_avg; + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + + return boosted ? 
rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; +} + +static inline int find_best_target(struct task_struct *p, int *backup_cpu, + bool boosted, bool prefer_idle) +{ + unsigned long best_idle_min_cap_orig = ULONG_MAX; + unsigned long min_util = boosted_task_util(p); + unsigned long target_capacity = ULONG_MAX; + unsigned long min_wake_util = ULONG_MAX; + unsigned long target_max_spare_cap = 0; + unsigned long target_util = ULONG_MAX; + unsigned long best_active_util = ULONG_MAX; + unsigned long target_idle_max_spare_cap = 0; + int best_idle_cstate = INT_MAX; + struct sched_domain *sd; + struct sched_group *sg; + int best_active_cpu = -1; + int best_idle_cpu = -1; + int target_cpu = -1; + int cpu, i; + + *backup_cpu = -1; + + schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts); + schedstat_inc(this_rq()->eas_stats.fbt_attempts); + + /* Find start CPU based on boost value */ + cpu = start_cpu(boosted); + if (cpu < 0) { + schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu); + schedstat_inc(this_rq()->eas_stats.fbt_no_cpu); + return -1; + } + + /* Find SD for the start CPU */ + sd = rcu_dereference(per_cpu(sd_ea, cpu)); + if (!sd) { + schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd); + schedstat_inc(this_rq()->eas_stats.fbt_no_sd); + return -1; + } + + /* Scan CPUs in all SDs */ + sg = sd->groups; + do { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + unsigned long capacity_curr = capacity_curr_of(i); + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util, min_capped_util; + + if (!cpu_online(i)) + continue; + + if (walt_cpu_high_irqload(i)) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util(p); + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + new_util = max(min_util, new_util); + + /* + * Include minimum capacity constraint: + * new_util contains the required utilization including + * boost. min_capped_util also takes into account a + * minimum capacity cap imposed on the CPU by external + * actors. + */ + min_capped_util = max(new_util, capacity_min_of(i)); + + if (new_util > capacity_orig) + continue; + + /* + * Case A) Latency sensitive tasks + * + * Unconditionally favoring tasks that prefer idle CPU to + * improve latency. + * + * Looking for: + * - an idle CPU, whatever its idle_state is, since + * the first CPUs we explore are more likely to be + * reserved for latency sensitive tasks. + * - a non idle CPU where the task fits in its current + * capacity and has the maximum spare capacity. + * - a non idle CPU with lower contention from other + * tasks and running at the lowest possible OPP. + * + * The last two goals tries to favor a non idle CPU + * where the task can run as if it is "almost alone". + * A maximum spare capacity CPU is favoured since + * the task already fits into that CPU's capacity + * without waiting for an OPP chance. + * + * The following code path is the only one in the CPUs + * exploration loop which is always used by + * prefer_idle tasks. It exits the loop with wither a + * best_active_cpu or a target_cpu which should + * represent an optimal choice for latency sensitive + * tasks. 
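The per-CPU estimate at the top of the find_best_target() scan above combines the CPU's utilization with the waking task removed, the task's own utilization, the boost floor, and any externally imposed minimum capacity. A standalone numeric sketch with made-up figures for one candidate CPU.

#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long capacity_orig = 1024;
	unsigned long capacity_min  = 300;	/* external min-freq cap, may be 0 */
	unsigned long wake_util     = 400;	/* cpu util with the waking task removed */
	unsigned long task_util     = 150;
	unsigned long min_util      = 600;	/* boosted_task_util(p) */

	/* expected util if the task wakes here, never below its boosted util */
	unsigned long new_util = max_ul(min_util, wake_util + task_util);	/* 600 */
	/* also respect any minimum capacity imposed on the CPU */
	unsigned long min_capped_util = max_ul(new_util, capacity_min);		/* 600 */

	if (new_util > capacity_orig)
		printf("skip: task would not fit on this CPU\n");
	else
		printf("spare capacity if placed here: %lu\n",
		       capacity_orig - min_capped_util);	/* 424 */
	return 0;
}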
+ */ + if (prefer_idle) { + + /* + * Case A.1: IDLE CPU + * Return the first IDLE CPU we find. + */ + if (idle_cpu(i)) { + schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq()->eas_stats.fbt_pref_idle); + + trace_sched_find_best_target(p, + prefer_idle, min_util, + cpu, best_idle_cpu, + best_active_cpu, i); + + return i; + } + + /* + * Case A.2: Target ACTIVE CPU + * Favor CPUs with max spare capacity. + */ + if ((capacity_curr > new_util) && + (capacity_orig - new_util > target_max_spare_cap)) { + target_max_spare_cap = capacity_orig - new_util; + target_cpu = i; + continue; + } + if (target_cpu != -1) + continue; + + + /* + * Case A.3: Backup ACTIVE CPU + * Favor CPUs with: + * - lower utilization due to other tasks + * - lower utilization with the task in + */ + if (wake_util > min_wake_util) + continue; + if (new_util > best_active_util) + continue; + min_wake_util = wake_util; + best_active_util = new_util; + best_active_cpu = i; + continue; + } + + /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + + /* + * Case B) Non latency sensitive tasks on IDLE CPUs. + * + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - minimizing the capacity_orig, + * i.e. preferring LITTLE CPUs + * - favoring shallowest idle states + * i.e. avoid to wakeup deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one of such CPUs are available it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performances by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumptions without affecting performance. + */ + if (idle_cpu(i)) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + + /* Select idle CPU with lower cap_orig */ + if (capacity_orig > best_idle_min_cap_orig) + continue; + /* Favor CPUs that won't end up running at a + * high OPP. + */ + if ((capacity_orig - min_capped_util) < + target_idle_max_spare_cap) + continue; + + /* + * Skip CPUs in deeper idle state, but only + * if they are also less energy efficient. + * IOW, prefer a deep IDLE LITTLE CPU vs a + * shallow idle big CPU. + */ + if (sysctl_sched_cstate_aware && + best_idle_cstate <= idle_idx) + continue; + + /* Keep track of best idle CPU */ + best_idle_min_cap_orig = capacity_orig; + target_idle_max_spare_cap = capacity_orig - + min_capped_util; + best_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + + /* + * Case C) Non latency sensitive tasks on ACTIVE CPUs. + * + * Pack tasks in the most energy efficient capacities. + * + * This task packing strategy prefers more energy + * efficient CPUs (i.e. pack on smaller maximum + * capacity CPUs) while also trying to spread tasks to + * run them all at the lower OPP. + * + * This assumes for example that it's more energy + * efficient to run two tasks on two CPUs at a lower + * OPP than packing both on a single CPU but running + * that CPU at an higher OPP. + * + * Thus, this case keep track of the CPU with the + * smallest maximum capacity and highest spare maximum + * capacity. 
+ */ + + /* Favor CPUs with smaller capacity */ + if (capacity_orig > target_capacity) + continue; + + /* Favor CPUs with maximum spare capacity */ + if ((capacity_orig - min_capped_util) < + target_max_spare_cap) + continue; + + target_max_spare_cap = capacity_orig - min_capped_util; + target_capacity = capacity_orig; + target_util = new_util; + target_cpu = i; + } + + } while (sg = sg->next, sg != sd->groups); + + /* + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we was not able to find a target + * ACTIVE CPU. + * + * Policies priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available, we return immediately + * b) ACTIVE CPU where task fits and has the bigger maximum spare + * capacity (i.e. target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu + */ + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + else + *backup_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, + best_idle_cpu, best_active_cpu, + target_cpu); + + schedstat_inc(p->se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq()->eas_stats.fbt_count); + + return target_cpu; } /* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. - * + * * In that case WAKE_AFFINE doesn't make sense and we'll let * BALANCE_WAKE sort things out. */ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; - min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; - + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; /* Minimum capacity is close to max, no need to abort wake_affine */ if (max_cap - min_cap < max_cap >> 3) return 0; + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + return min_cap * 1024 < task_util(p) * capacity_margin; } +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) +{ + bool boosted, prefer_idle; + struct sched_domain *sd; + int target_cpu; + int backup_cpu; + int next_cpu; + + schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts); + schedstat_inc(this_rq()->eas_stats.secb_attempts); + + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p->se.statistics.nr_wakeups_secb_sync); + schedstat_inc(this_rq()->eas_stats.secb_sync); + return cpu; + } + } + +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = schedtune_task_boost(p) > 0; + prefer_idle = schedtune_prefer_idle(p) > 0; +#else + boosted = get_sysctl_sched_cfs_boost() > 0; + prefer_idle = 0; +#endif + + rcu_read_lock(); + + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + if (!sd) { + target_cpu = prev_cpu; + goto unlock; + } + + sync_entity_load_avg(&p->se); + + /* Find a cpu with sufficient capacity */ + next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle); + if (next_cpu == -1) { + target_cpu = prev_cpu; + goto unlock; + } + + /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */ + if ((boosted || prefer_idle) && idle_cpu(next_cpu)) { + schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq()->eas_stats.secb_idle_bt); + target_cpu = next_cpu; + goto unlock; 
+ } + + target_cpu = prev_cpu; + if (next_cpu != prev_cpu) { + int delta = 0; + struct energy_env eenv = { + .p = p, + .util_delta = task_util(p), + /* Task's previous CPU candidate */ + .cpu[EAS_CPU_PRV] = { + .cpu_id = prev_cpu, + }, + /* Main alternative CPU candidate */ + .cpu[EAS_CPU_NXT] = { + .cpu_id = next_cpu, + }, + /* Backup alternative CPU candidate */ + .cpu[EAS_CPU_BKP] = { + .cpu_id = backup_cpu, + }, + }; + + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util && + p->state == TASK_WAKING) + delta = task_util(p); +#endif + /* Not enough spare capacity on previous cpu */ + if (__cpu_overutilized(prev_cpu, delta)) { + schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap); + schedstat_inc(this_rq()->eas_stats.secb_insuff_cap); + target_cpu = next_cpu; + goto unlock; + } + + /* Check if EAS_CPU_NXT is a more energy efficient CPU */ + if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) { + schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq()->eas_stats.secb_nrg_sav); + target_cpu = eenv.cpu[eenv.next_idx].cpu_id; + goto unlock; + } + + schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; + goto unlock; + } + + schedstat_inc(p->se.statistics.nr_wakeups_secb_count); + schedstat_inc(this_rq()->eas_stats.secb_count); + +unlock: + rcu_read_unlock(); + return target_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5591,10 +7192,13 @@ select_task_rq_fair(struct task_struct * if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))); } + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) + return select_energy_cpu_brute(p, prev_cpu, sync); + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) @@ -5622,43 +7226,21 @@ select_task_rq_fair(struct task_struct * new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ + } else { + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); @@ -5718,6 +7300,8 @@ static void task_dead_fair(struct task_s { remove_entity_load_avg(&p->se); } +#else +#define task_fits_max(p, cpu) true #endif /* CONFIG_SMP */ static unsigned long @@ -5964,6 +7548,8 @@ again: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5985,9 +7571,12 @@ simple: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding @@ -6200,6 +7789,13 @@ static unsigned long __read_mostly max_l enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6218,6 +7814,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -6228,6 +7825,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type busiest_group_type; struct list_head tasks; }; @@ -6409,7 +8007,9 @@ static void detach_task(struct task_stru p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(env->src_rq, p, 0); + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); } /* @@ -6593,12 +8193,19 @@ static void update_blocked_averages(int * list_add_leaf_cfs_rq() for details. 
*/ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6670,12 +8277,6 @@ static unsigned long task_h_load(struct /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -6691,6 +8292,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6788,13 +8390,46 @@ static unsigned long scale_rt_capacity(i return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + capacity *= arch_scale_max_freq_capacity(sd, cpu); + capacity >>= SCHED_CAPACITY_SHIFT; + + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -6803,13 +8438,15 @@ static void update_cpu_capacity(struct s cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6822,6 +8459,8 @@ void update_group_capacity(struct sched_ } capacity = 0; + max_capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -6846,11 +8485,13 @@ void update_group_capacity(struct sched_ */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); + min_capacity = min(capacity, min_capacity); } } else { /* @@ -6860,12 +8501,18 @@ void update_group_capacity(struct sched_ group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = 
max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; + sdg->sgc->min_capacity = min_capacity; } /* @@ -6960,6 +8607,17 @@ group_is_overloaded(struct lb_env *env, return false; } +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-cpu capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity + capacity_margin - SCHED_CAPACITY_SCALE < + ref->sgc->max_capacity; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -6970,9 +8628,44 @@ group_type group_classify(struct sched_g if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } +#ifdef CONFIG_NO_HZ_COMMON +/* + * idle load balancing data + * - used by the nohz balance, but we want it available here + * so that we can see which CPUs have no tick. + */ +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; + +static inline void update_cpu_stats_if_tickless(struct rq *rq) +{ + /* only called from update_sg_lb_stats when irqs are disabled */ + if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) { + /* rate limit updates to once-per-jiffie at most */ + if (READ_ONCE(jiffies) <= rq->last_load_update_tick) + return; + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + cpu_load_update_idle(rq); + update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false); + raw_spin_unlock(&rq->lock); + } +} + +#else +static inline void update_cpu_stats_if_tickless(struct rq *rq) { } +#endif + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -6981,11 +8674,12 @@ group_type group_classify(struct sched_g * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; int i, nr_running; @@ -6995,6 +8689,12 @@ static inline void update_sg_lb_stats(st for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); + /* if we are entering idle and there are CPUs with + * their tick stopped, do an update for them + */ + if (env->idle == CPU_NEWLY_IDLE) + update_cpu_stats_if_tickless(rq); + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -7019,6 +8719,12 @@ static inline void update_sg_lb_stats(st */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) { + *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -7060,9 +8766,31 @@ static bool update_sd_pick_busiest(struc if (sgs->group_type < busiest->group_type) return false; + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. 
+ */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; + if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -7117,6 +8845,9 @@ static inline enum fbq_type fbq_classify } #endif /* CONFIG_NUMA_BALANCING */ +#define lb_sd_parent(sd) \ + (sd->parent && sd->parent->groups != sd->parent->groups->next) + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -7128,7 +8859,7 @@ static inline void update_sd_lb_stats(st struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -7150,7 +8881,7 @@ static inline void update_sd_lb_stats(st } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -7172,6 +8903,15 @@ static inline void update_sd_lb_stats(st sgs->group_type = group_classify(sg, sgs); } + /* + * Ignore task groups with misfit tasks if local group has no + * capacity or if per-cpu capacity isn't higher. + */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -7188,10 +8928,23 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); - if (!env->sd->parent) { + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + + if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; + + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) { + env->dst_rq->rd->overutilized = overutilized; + trace_sched_overutilized(overutilized); + } + } else { + if (!env->dst_rq->rd->overutilized && overutilized) { + env->dst_rq->rd->overutilized = true; + trace_sched_overutilized(true); + } } } @@ -7344,6 +9097,22 @@ static inline void calculate_imbalance(s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -7377,6 +9146,11 @@ static inline void calculate_imbalance(s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost 
imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -7412,6 +9186,10 @@ static struct sched_group *find_busiest_ * this level. */ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -7434,11 +9212,19 @@ static struct sched_group *find_busiest_ if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -7462,7 +9248,8 @@ static struct sched_group *find_busiest_ * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -7475,6 +9262,7 @@ static struct sched_group *find_busiest_ } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -7533,7 +9321,8 @@ static struct rq *find_busiest_queue(str */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* @@ -7591,6 +9380,14 @@ static int need_active_balance(struct lb return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -7639,7 +9436,7 @@ static int load_balance(int this_cpu, st int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -7706,6 +9503,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -7803,7 +9601,8 @@ more_balance: * excessive cache_hot migrations and active balances. 
*/ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); @@ -7940,8 +9739,9 @@ static int idle_balance(struct rq *this_ */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) @@ -8032,8 +9832,18 @@ static int active_load_balance_cpu_stop( int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; + struct sched_domain *sd = NULL; struct task_struct *p = NULL; + struct task_struct *push_task = NULL; + int push_task_detached = 0; + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; raw_spin_lock_irq(&busiest_rq->lock); @@ -8053,6 +9863,17 @@ static int active_load_balance_cpu_stop( */ BUG_ON(busiest_rq == target_rq); + push_task = busiest_rq->push_task; + if (push_task) { + if (task_on_rq_queued(push_task) && + task_cpu(push_task) == busiest_cpu && + cpu_online(target_cpu)) { + detach_task(push_task, &env); + push_task_detached = 1; + } + goto out_unlock; + } + /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -8062,16 +9883,9 @@ static int active_load_balance_cpu_stop( } if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - + env.sd = sd; schedstat_inc(sd->alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { @@ -8085,8 +9899,18 @@ static int active_load_balance_cpu_stop( rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; + + if (push_task) + busiest_rq->push_task = NULL; + raw_spin_unlock(&busiest_rq->lock); + if (push_task) { + if (push_task_detached) + attach_one_task(target_rq, push_task); + put_task_struct(push_task); + } + if (p) attach_one_task(target_rq, p); @@ -8107,12 +9931,6 @@ static inline int on_null_domain(struct * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. 
*/ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ -} nohz ____cacheline_aligned; - static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -8446,9 +10264,14 @@ static inline bool nohz_kick_needed(stru if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) return true; + /* Do idle load balance if there have misfit task */ + if (energy_aware()) + return rq->misfit_task; + rcu_read_lock(); sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); if (sds) { @@ -8542,6 +10365,47 @@ static void rq_offline_fair(struct rq *r unthrottle_offline_cfs_rqs(rq); } +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + int rc = 0; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock(&rq->lock); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock(&rq->lock); + + return rc; +} + +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int new_cpu; + int active_balance; + int cpu = task_cpu(p); + + if (energy_aware() && rq->misfit_task) { + if (rq->curr->state != TASK_RUNNING || + rq->curr->nr_cpus_allowed == 1) + return; + + new_cpu = select_energy_cpu_brute(p, cpu, 0); + if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + stop_one_cpu_nowait(cpu, + active_load_balance_cpu_stop, + rq, &rq->active_balance_work); + } + } +} + #endif /* CONFIG_SMP */ /* @@ -8559,6 +10423,16 @@ static void task_tick_fair(struct rq *rq if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + +#ifdef CONFIG_SMP + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { + rq->rd->overutilized = true; + trace_sched_overutilized(true); + } + + rq->misfit_task = !task_fits_max(curr, rq->cpu); +#endif + } /* @@ -8645,32 +10519,45 @@ static inline bool vruntime_normalized(s return false; } -static void detach_task_cfs_rq(struct task_struct *p) +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); + struct cfs_rq *cfs_rq; - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. 
- */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } -static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8680,10 +10567,36 @@ static void attach_task_cfs_rq(struct ta se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - /* Synchronize task with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + /* Synchronize entity with its cfs_rq */ + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8737,6 +10650,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif @@ -8845,7 +10761,8 @@ void online_fair_sched_group(struct task se = tg->se[i]; raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); + update_rq_clock(rq); + attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } @@ -8937,8 +10854,10 @@ int sched_group_set_shares(struct task_g /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); }
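
The clamping done in cpu_util_wake() above is easy to get wrong, so here is a minimal standalone sketch of just that arithmetic (plain C, not part of the patch; the three parameters stand in for the kernel's accessors and are hypothetical inputs): remove the waking task's contribution, clamp the result at zero, and cap it at the CPU's original capacity.

static unsigned long util_without_task(unsigned long cpu_util,
				       unsigned long task_util,
				       unsigned long capacity_orig)
{
	/*
	 * The task's contribution may exceed the CPU figure right after a
	 * migration, so clamp the difference at zero rather than underflow.
	 */
	long util = (long)cpu_util - (long)task_util;

	if (util < 0)
		util = 0;

	/* Utilization is never reported above the CPU's original capacity. */
	return ((unsigned long)util >= capacity_orig) ? capacity_orig
						      : (unsigned long)util;
}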
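
The wake-up path above applies the same headroom test in several places, new_util * capacity_margin > capacity_orig * SCHED_CAPACITY_SCALE, to decide whether placing the task would leave a CPU over-utilized. Below is a self-contained sketch of that check, assuming the default capacity_margin of 1280 (roughly 20% headroom); it is an illustration of the arithmetic, not kernel code.

#include <stdio.h>
#include <stdbool.h>

#define SCHED_CAPACITY_SCALE	1024UL

/* Assumed default: ~20% headroom above the task's utilization. */
static const unsigned long capacity_margin = 1280UL;

static bool cpu_would_be_overutilized(unsigned long new_util,
				      unsigned long capacity_orig)
{
	return new_util * capacity_margin >
	       capacity_orig * SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/*
	 * A CPU with capacity_orig = 430 tolerates at most
	 * 430 * 1024 / 1280 = 344 units of utilization.
	 */
	printf("%d\n", cpu_would_be_overutilized(344, 430));	/* 0: fits   */
	printf("%d\n", cpu_would_be_overutilized(345, 430));	/* 1: skipped */
	return 0;
}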
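
The final selection in find_best_target() follows the priority order spelled out in its closing comment. The sketch below models only that fallback step in plain C; it is an illustration rather than the kernel function, and every CPU id is a hypothetical output of the scan loop, with -1 meaning "none found".

static int pick_target(int prefer_idle, int target_cpu,
		       int best_active_cpu, int best_idle_cpu,
		       int *backup_cpu)
{
	*backup_cpu = -1;

	/* No suitable active target: fall back per task class. */
	if (target_cpu == -1)
		return prefer_idle ? best_active_cpu : best_idle_cpu;

	/* Active target found: keep the other class's pick as backup. */
	*backup_cpu = prefer_idle ? best_active_cpu : best_idle_cpu;
	return target_cpu;
}

Note that a prefer_idle task never reaches the idle fallback here, because an idle CPU would already have been returned directly from the scan loop (case A.1 above).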