Heiko Carstens | 12 Aug 19:25 2010

[PATCH/RFC 3/5] sched: add book scheduling domain

From: Heiko Carstens <heiko.carstens@de.ibm.com>

On top of the SMT and MC scheduling domains this adds the BOOK scheduling
domain. This is useful for machines that have a four-level cache hierarchy
but do not fall into the NUMA category.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
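Note (not part of the changelog): the core scheduler changes below only
reference the arch glue by name -- CONFIG_SCHED_BOOK, cpu_book_mask(),
SD_BOOK_INIT and book_capable() come from a separate architecture patch.
For readers without that patch, here is a minimal, purely illustrative
sketch of what an architecture's asm/topology.h would have to provide;
the names are taken from this diff, the bodies are placeholders:

  #ifdef CONFIG_SCHED_BOOK
  /* per-cpu book mask, filled in by arch topology detection (placeholder) */
  extern cpumask_t cpu_book_map[NR_CPUS];

  static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
  {
          return &cpu_book_map[cpu];
  }

  /* does this machine actually have books? illustrative stub only */
  static inline int book_capable(void)
  {
          return 1;
  }

  /*
   * Sched domain initializer for the book level, analogous to
   * SD_CPU_INIT/SD_MC_INIT. Reusing SD_CPU_INIT is just a placeholder;
   * a real architecture would tune the flags for its cache hierarchy.
   */
  #define SD_BOOK_INIT    SD_CPU_INIT
  #endif /* CONFIG_SCHED_BOOK */

With these in place, __build_sched_domains() below stacks the levels as
SMT -> MC -> BOOK -> CPU (-> NUMA), i.e. the book domain sits between the
multi-core level and the physical (per-node) level.
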

 arch/s390/defconfig      |    1 
 include/linux/sched.h    |   19 +++++++
 include/linux/topology.h |    6 ++
 kernel/sched.c           |  112 ++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched_fair.c      |   11 ++--
 5 files changed, 137 insertions(+), 12 deletions(-)

diff -urpN linux-2.6/arch/s390/defconfig linux-2.6-patched/arch/s390/defconfig
--- linux-2.6/arch/s390/defconfig	2010-08-02 00:11:14.000000000 +0200
+++ linux-2.6-patched/arch/s390/defconfig	2010-08-11 13:47:23.000000000 +0200
@@ -248,6 +248,7 @@ CONFIG_64BIT=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=32
 CONFIG_HOTPLUG_CPU=y
+# CONFIG_SCHED_BOOK is not set
 CONFIG_COMPAT=y
 CONFIG_SYSVIPC_COMPAT=y
 CONFIG_AUDIT_ARCH=y
diff -urpN linux-2.6/include/linux/sched.h linux-2.6-patched/include/linux/sched.h
--- linux-2.6/include/linux/sched.h	2010-08-11 13:47:16.000000000 +0200
+++ linux-2.6-patched/include/linux/sched.h	2010-08-11 13:47:23.000000000 +0200
@@ -807,7 +807,9 @@ enum powersavings_balance_level {
 	MAX_POWERSAVINGS_BALANCE_LEVELS
 };

-extern int sched_mc_power_savings, sched_smt_power_savings;
+extern int sched_smt_power_savings;
+extern int sched_mc_power_savings;
+extern int sched_book_power_savings;

 static inline int sd_balance_for_mc_power(void)
 {
@@ -820,11 +822,23 @@ static inline int sd_balance_for_mc_powe
 	return 0;
 }

-static inline int sd_balance_for_package_power(void)
+static inline int sd_balance_for_book_power(void)
 {
 	if (sched_mc_power_savings | sched_smt_power_savings)
 		return SD_POWERSAVINGS_BALANCE;

+	if (!sched_book_power_savings)
+		return SD_PREFER_SIBLING;
+
+	return 0;
+}
+
+static inline int sd_balance_for_package_power(void)
+{
+	if (sched_book_power_savings | sched_mc_power_savings |
+	    sched_smt_power_savings)
+		return SD_POWERSAVINGS_BALANCE;
+
 	return SD_PREFER_SIBLING;
 }

@@ -875,6 +889,7 @@ enum sched_domain_level {
 	SD_LV_NONE = 0,
 	SD_LV_SIBLING,
 	SD_LV_MC,
+	SD_LV_BOOK,
 	SD_LV_CPU,
 	SD_LV_NODE,
 	SD_LV_ALLNODES,
diff -urpN linux-2.6/include/linux/topology.h linux-2.6-patched/include/linux/topology.h
--- linux-2.6/include/linux/topology.h	2010-08-11 13:47:16.000000000 +0200
+++ linux-2.6-patched/include/linux/topology.h	2010-08-11 13:47:23.000000000 +0200
@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
 	.balance_interval	= 64,					\
 }

+#ifdef CONFIG_SCHED_BOOK
+#ifndef SD_BOOK_INIT
+#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
+#endif
+#endif /* CONFIG_SCHED_BOOK */
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff -urpN linux-2.6/kernel/sched.c linux-2.6-patched/kernel/sched.c
--- linux-2.6/kernel/sched.c	2010-08-11 13:47:23.000000000 +0200
+++ linux-2.6-patched/kernel/sched.c	2010-08-11 13:47:23.000000000 +0200
@@ -6472,7 +6472,9 @@ static void sched_domain_node_span(int n
 }
 #endif /* CONFIG_NUMA */

-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+int sched_smt_power_savings;
+int sched_mc_power_savings;
+int sched_book_power_savings;

 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
@@ -6500,6 +6502,7 @@ struct s_data {
 	cpumask_var_t		nodemask;
 	cpumask_var_t		this_sibling_map;
 	cpumask_var_t		this_core_map;
+	cpumask_var_t		this_book_map;
 	cpumask_var_t		send_covered;
 	cpumask_var_t		tmpmask;
 	struct sched_group	**sched_group_nodes;
@@ -6511,6 +6514,7 @@ enum s_alloc {
 	sa_rootdomain,
 	sa_tmpmask,
 	sa_send_covered,
+	sa_this_book_map,
 	sa_this_core_map,
 	sa_this_sibling_map,
 	sa_nodemask,
@@ -6564,6 +6568,31 @@ cpu_to_core_group(int cpu, const struct
 }
 #endif /* CONFIG_SCHED_MC */

+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
+static int
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *mask)
+{
+	int group = cpu;
+#ifdef CONFIG_SCHED_MC
+	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#endif
+	if (sg)
+		*sg = &per_cpu(sched_group_book, group).sg;
+	return group;
+}
+#endif /* CONFIG_SCHED_BOOK */
+
 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);

@@ -6572,7 +6601,10 @@ cpu_to_phys_group(int cpu, const struct
 		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+	cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
 	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
@@ -6833,6 +6865,9 @@ SD_INIT_FUNC(CPU)
 #ifdef CONFIG_SCHED_MC
  SD_INIT_FUNC(MC)
 #endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif

 static int default_relax_domain_level = -1;

@@ -6882,6 +6917,8 @@ static void __free_domain_allocs(struct
 		free_cpumask_var(d->tmpmask); /* fall through */
 	case sa_send_covered:
 		free_cpumask_var(d->send_covered); /* fall through */
+	case sa_this_book_map:
+		free_cpumask_var(d->this_book_map); /* fall through */
 	case sa_this_core_map:
 		free_cpumask_var(d->this_core_map); /* fall through */
 	case sa_this_sibling_map:
@@ -6928,8 +6965,10 @@ static enum s_alloc __visit_domain_alloc
 		return sa_nodemask;
 	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
 		return sa_this_sibling_map;
-	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
 		return sa_this_core_map;
+	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+		return sa_this_book_map;
 	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
 		return sa_send_covered;
 	d->rd = alloc_rootdomain();
@@ -6987,6 +7026,23 @@ static struct sched_domain *__build_cpu_
 	return sd;
 }

+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+	sd = &per_cpu(book_domains, i).sd;
+	SD_INIT(sd, BOOK);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+	return sd;
+}
+
 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
 	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 	struct sched_domain *parent, int i)
@@ -7044,6 +7100,15 @@ static void build_sched_groups(struct s_
 						d->send_covered, d->tmpmask);
 		break;
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	case SD_LV_BOOK: /* set up book groups */
+		cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+		if (cpu == cpumask_first(d->this_book_map))
+			init_sched_build_groups(d->this_book_map, cpu_map,
+						&cpu_to_book_group,
+						d->send_covered, d->tmpmask);
+		break;
+#endif
 	case SD_LV_CPU: /* set up physical groups */
 		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
 		if (!cpumask_empty(d->nodemask))
@@ -7091,12 +7156,14 @@ static int __build_sched_domains(const s

 		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+		sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 	}

 	for_each_cpu(i, cpu_map) {
 		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+		build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
 		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
 	}

@@ -7127,6 +7194,12 @@ static int __build_sched_domains(const s
 		init_sched_groups_power(i, sd);
 	}
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	for_each_cpu(i, cpu_map) {
+		sd = &per_cpu(book_domains, i).sd;
+		init_sched_groups_power(i, sd);
+	}
+#endif

 	for_each_cpu(i, cpu_map) {
 		sd = &per_cpu(phys_domains, i).sd;
@@ -7152,6 +7225,8 @@ static int __build_sched_domains(const s
 		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
 		sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+		sd = &per_cpu(book_domains, i).sd;
 #else
 		sd = &per_cpu(phys_domains, i).sd;
 #endif
@@ -7368,7 +7443,8 @@ match2:
 	mutex_unlock(&sched_domains_mutex);
 }

-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+#if defined(CONFIG_SCHED_BOOK) || defined(CONFIG_SCHED_MC) || \
+    defined(CONFIG_SCHED_SMT)
 static void arch_reinit_sched_domains(void)
 {
 	get_online_cpus();
@@ -7405,6 +7481,9 @@ static ssize_t sched_power_savings_store
 	case SD_LV_MC:
 		sched_mc_power_savings = level;
 		break;
+	case SD_LV_BOOK:
+		sched_book_power_savings = level;
+		break;
 	default:
 		break;
 	}
@@ -7414,6 +7493,24 @@ static ssize_t sched_power_savings_store
 	return count;
 }

+#ifdef CONFIG_SCHED_BOOK
+static ssize_t sched_book_power_savings_show(struct sysdev_class *class,
+					     struct sysdev_class_attribute *attr,
+					     char *page)
+{
+	return sprintf(page, "%u\n", sched_book_power_savings);
+}
+static ssize_t sched_book_power_savings_store(struct sysdev_class *class,
+					      struct sysdev_class_attribute *attr,
+					      const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, SD_LV_BOOK);
+}
+static SYSDEV_CLASS_ATTR(sched_book_power_savings, 0644,
+			 sched_book_power_savings_show,
+			 sched_book_power_savings_store);
+#endif
+
 #ifdef CONFIG_SCHED_MC
 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
 					   struct sysdev_class_attribute *attr,
@@ -7464,9 +7561,14 @@ int __init sched_create_sysfs_power_savi
 		err = sysfs_create_file(&cls->kset.kobj,
 					&attr_sched_mc_power_savings.attr);
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	if (!err && book_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_book_power_savings.attr);
+#endif
 	return err;
 }
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+#endif /* CONFIG_SCHED_BOOK || CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
diff -urpN linux-2.6/kernel/sched_fair.c linux-2.6-patched/kernel/sched_fair.c
--- linux-2.6/kernel/sched_fair.c	2010-08-11 13:47:16.000000000 +0200
+++ linux-2.6-patched/kernel/sched_fair.c	2010-08-11 13:47:23.000000000 +0200
@@ -2039,7 +2039,8 @@ struct sd_lb_stats {
 	unsigned long busiest_group_capacity;

 	int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+#if defined(CONFIG_SCHED_BOOK) || defined(CONFIG_SCHED_MC) || \
+    defined(CONFIG_SCHED_SMT)
 	int power_savings_balance; /* Is powersave balance needed for this sd */
 	struct sched_group *group_min; /* Least loaded group in sd */
 	struct sched_group *group_leader; /* Group which relieves group_min */
@@ -2096,8 +2097,8 @@ static inline int get_sd_load_idx(struct
 	return load_idx;
 }

-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+#if defined(CONFIG_SCHED_BOOK) || defined(CONFIG_SCHED_MC) || \
+    defined(CONFIG_SCHED_SMT)
 /**
  * init_sd_power_savings_stats - Initialize power savings statistics for
  * the given sched_domain, during load balancing.
@@ -2217,7 +2218,7 @@ static inline int check_power_save_busie
 	return 1;

 }
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+#else /* CONFIG_SCHED_BOOK || CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 	struct sd_lb_stats *sds, enum cpu_idle_type idle)
 {
@@ -2235,7 +2236,7 @@ static inline int check_power_save_busie
 {
 	return 0;
 }
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+#endif /* CONFIG_SCHED_BOOK || CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

 
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
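
For reference, once an architecture selects CONFIG_SCHED_BOOK and reports
book_capable(), the patch registers the new knob next to the existing
MC/SMT ones on the cpu sysdev class. By analogy with the existing
sched_mc_power_savings attribute it should appear as
/sys/devices/system/cpu/sched_book_power_savings and, like the other two,
accept the levels 0, 1 or 2 via sched_power_savings_store(), e.g.:

  echo 1 > /sys/devices/system/cpu/sched_book_power_savings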

