diff -urpN -X /home/fletch/.diff.exclude 105-numameminfo/fs/proc/proc_misc.c 111-schedstat/fs/proc/proc_misc.c
--- 105-numameminfo/fs/proc/proc_misc.c	Fri May 30 19:03:27 2003
+++ 111-schedstat/fs/proc/proc_misc.c	Fri May 30 19:03:28 2003
@@ -303,6 +303,9 @@ static struct file_operations proc_vmsta
 	.release	= seq_release,
 };
 
+extern int schedstats_read_proc(char *page, char **start, off_t off,
+				 int count, int *eof, void *data);
+
 #ifdef CONFIG_PROC_HARDWARE
 static int hardware_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
@@ -701,6 +704,7 @@ void __init proc_misc_init(void)
 #endif
 		{"locks",	locks_read_proc},
 		{"execdomains",	execdomains_read_proc},
+		{"schedstat",	schedstats_read_proc},
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
diff -urpN -X /home/fletch/.diff.exclude 105-numameminfo/kernel/sched.c 111-schedstat/kernel/sched.c
--- 105-numameminfo/kernel/sched.c	Fri May 30 19:02:24 2003
+++ 111-schedstat/kernel/sched.c	Fri May 30 19:04:34 2003
@@ -230,6 +230,83 @@ __init void node_nr_running_init(void)
 
 #endif /* CONFIG_NUMA */
 
+
+struct schedstat {
+	/* sys_sched_yield stats */
+	unsigned long yld_exp_empty;
+	unsigned long yld_act_empty;
+	unsigned long yld_both_empty;
+	unsigned long yld_cnt;
+
+	/* schedule stats */
+	unsigned long sched_noswitch;
+	unsigned long sched_switch;
+	unsigned long sched_cnt;
+
+	/* load_balance stats */
+	unsigned long lb_imbalance;
+	unsigned long lb_idle;
+	unsigned long lb_resched;
+	unsigned long lb_cnt;
+	unsigned long lb_nobusy;
+} ____cacheline_aligned;
+
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * field, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION	1
+
+struct schedstat schedstats[NR_CPUS];
+
+/*
+ * This could conceivably exceed a page's worth of output on machines with
+ * a large number of cpus, where large == about 4096/100, or 40ish. Start
+ * worrying when we pass 32, probably. Then this has to stop being a
+ * "simple" entry in fs/proc/proc_misc.c and needs to be an actual seq_file.
+ */
+int schedstats_read_proc(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{
+	struct schedstat sums;
+	int i, len;
+
+	memset(&sums, 0, sizeof(sums));
+	len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION);
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i)) continue;
+		sums.yld_exp_empty += schedstats[i].yld_exp_empty;
+		sums.yld_act_empty += schedstats[i].yld_act_empty;
+		sums.yld_both_empty += schedstats[i].yld_both_empty;
+		sums.yld_cnt += schedstats[i].yld_cnt;
+		sums.sched_noswitch += schedstats[i].sched_noswitch;
+		sums.sched_switch += schedstats[i].sched_switch;
+		sums.sched_cnt += schedstats[i].sched_cnt;
+		sums.lb_idle += schedstats[i].lb_idle;
+		sums.lb_resched += schedstats[i].lb_resched;
+		sums.lb_cnt += schedstats[i].lb_cnt;
+		sums.lb_imbalance += schedstats[i].lb_imbalance;
+		sums.lb_nobusy += schedstats[i].lb_nobusy;
+		len += sprintf(page + len,
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+		    i, schedstats[i].yld_both_empty,
+		    schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty,
+		    schedstats[i].yld_cnt, schedstats[i].sched_noswitch,
+		    schedstats[i].sched_switch, schedstats[i].sched_cnt,
+		    schedstats[i].lb_idle, schedstats[i].lb_resched,
+		    schedstats[i].lb_cnt, schedstats[i].lb_imbalance,
+		    schedstats[i].lb_nobusy);
+	}
+	len += sprintf(page + len,
+	    "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+	    sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty,
+	    sums.yld_cnt, sums.sched_noswitch, sums.sched_switch,
+	    sums.sched_cnt, sums.lb_idle, sums.lb_resched, sums.lb_cnt,
+	    sums.lb_imbalance, sums.lb_nobusy);
+
+	return len;
+}
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
@@ -985,10 +1061,16 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
+	schedstats[this_cpu].lb_cnt++;
+	if (idle)
+		schedstats[this_cpu].lb_idle++;
 	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
-	if (!busiest)
+	if (!busiest) {
+		schedstats[this_cpu].lb_nobusy++;
 		goto out;
+	}
 
+	schedstats[this_cpu].lb_imbalance += imbalance;
 	/*
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
@@ -1243,13 +1325,14 @@ asmlinkage void schedule(void)
 	runqueue_t *rq;
 	prio_array_t *array;
 	struct list_head *queue;
-	int idx;
+	int idx, mycpu = smp_processor_id();
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
+	schedstats[mycpu].sched_cnt++;
 	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
 		if (unlikely(in_atomic())) {
 			printk(KERN_ERR "bad: scheduling while atomic!\n");
@@ -1288,6 +1371,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
+		schedstats[mycpu].lb_resched++;
 		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
 		if (rq->nr_running)
 			goto pick_next_task;
@@ -1302,11 +1386,13 @@ pick_next_task:
 		/*
 		 * Switch the active and expired arrays.
 		 */
+		schedstats[mycpu].sched_switch++;
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
 		rq->expired_timestamp = 0;
 	}
+	schedstats[mycpu].sched_noswitch++;
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
@@ -1958,6 +2044,7 @@ asmlinkage long sys_sched_yield(void)
 {
 	runqueue_t *rq = this_rq_lock();
 	prio_array_t *array = current->array;
+	int mycpu = smp_processor_id();
 
 	/*
 	 * We implement yielding by moving the task into the expired
@@ -1966,7 +2053,15 @@ asmlinkage long sys_sched_yield(void)
 	 * (special rule: RT tasks will just roundrobin in the active
 	 *  array.)
 	 */
+	schedstats[mycpu].yld_cnt++;
 	if (likely(!rt_task(current))) {
+		if (current->array->nr_active == 1) {
+			schedstats[mycpu].yld_act_empty++;
+			if (!rq->expired->nr_active)
+				schedstats[mycpu].yld_both_empty++;
+		} else if (!rq->expired->nr_active) {
+			schedstats[mycpu].yld_exp_empty++;
+		}
 		dequeue_task(current, array);
 		enqueue_task(current, rq->expired);
 	} else {
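
Not part of the patch proper: the comment above SCHEDSTAT_VERSION says tools
should adapt (or abort) when the format changes, so a consumer is expected to
read the "version" line before parsing anything else.  Each per-cpu line then
carries the counters in the order printed by schedstats_read_proc():
yld_both_empty, yld_act_empty, yld_exp_empty, yld_cnt, sched_noswitch,
sched_switch, sched_cnt, lb_idle, lb_resched, lb_cnt, lb_imbalance, lb_nobusy,
followed by a "totals" line with the same fields summed over online cpus.
The userspace reader below is only an illustrative sketch of that contract
(reporting just the yield counts, and all of the error handling, are
assumptions of the example, not anything mandated by the patch):

	/* schedstat-reader.c: minimal, hypothetical /proc/schedstat consumer */
	#include <stdio.h>

	int main(void)
	{
		FILE *fp = fopen("/proc/schedstat", "r");
		char line[256];
		int version;

		if (!fp) {
			perror("/proc/schedstat");
			return 1;
		}
		/* refuse to guess if the kernel reports a format we don't know */
		if (!fgets(line, sizeof(line), fp) ||
		    sscanf(line, "version %d", &version) != 1 || version != 1) {
			fprintf(stderr, "unknown schedstat version, aborting\n");
			fclose(fp);
			return 1;
		}
		while (fgets(line, sizeof(line), fp)) {
			int cpu;
			unsigned long both_empty, act_empty, exp_empty, yld_cnt;

			/* field order matches the sprintf in schedstats_read_proc();
			 * the "totals" line simply fails this match and is skipped */
			if (sscanf(line, "cpu%d %lu %lu %lu %lu", &cpu, &both_empty,
				   &act_empty, &exp_empty, &yld_cnt) == 5)
				printf("cpu%d: %lu calls to sys_sched_yield()\n",
				       cpu, yld_cnt);
		}
		fclose(fp);
		return 0;
	}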