This patch was written primarily by Christoph Lameter to 
allow numactl +nn syntax for cpuset-relative cpu and node numbers.
Some adjustments were made by Cliff Wickman.

It reads /proc/%d/status to determine the current task's allowed cpus
and nodes.

(Cliff also tweaked test/regress to allow "make test" to complete
 successfully on a 24-cpu ia64.)

Diffed against numactl-1.0.2

Signed-off-by: Cliff Wickman <cpw@sgi.com>
---
 libnuma.c    |  147 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 numactl.8    |   56 +++++++++++++++++++---
 numactl.c    |   23 +++++----
 test/regress |    5 +-
 util.c       |   78 +++++++++++++++++++++++--------
 5 files changed, 247 insertions(+), 62 deletions(-)

Index: numactl-1.0.2/util.c
===================================================================
--- numactl-1.0.2.orig/util.c
+++ numactl-1.0.2/util.c
@@ -49,18 +49,42 @@ void printmask(char *name, nodemask_t *m
 	putchar('\n');
 } 
 
+/*
+ * Extract a node or processor number from the given string. A leading '+'
+ * makes the number relative: +N is the Nth (0-based) bit set in mask[0..max].
+ * On failure *end is set to s, the condition the callers already test for.
+ */
+unsigned long get_nr(char *s, char **end, int max, unsigned long *mask)
+{
+	unsigned long i, nr;
+	char *start = s;
+
+	if (*s != '+')
+		return strtoul(s, end, 0);
+	nr = strtoul(++s, end, 0);
+	/* Walk the allowed set; the set bit reached with nr == 0 is the answer */
+	for (i = 0; *end != s && max >= 0 && i <= (unsigned long)max; i++) {
+		if (!test_bit(i, mask))
+			continue;
+		if (nr-- == 0)
+			return i;
+	}
+	/* No digits after '+', or fewer than N+1 bits set: report a parse error */
+	*end = start;
+	return 0;
+}
+
 int numcpus; 
+extern unsigned long numa_all_cpus[];
+extern int maxcpus;
 
 /* caller must free buffer */
 unsigned long *cpumask(char *s, int *ncpus) 
 {
-	int invert = 0;
+	int invert = 0, relative=0;
 	char *end; 
 
-	if (!numcpus) 
-		numcpus = sysconf(_SC_NPROCESSORS_CONF); 
-
-	int cpubufsize = round_up(numcpus, BITS_PER_LONG) / BYTES_PER_LONG;
+	int cpubufsize = round_up(maxcpus, BITS_PER_LONG) / 8;
 	unsigned long *cpubuf = calloc(cpubufsize,1); 
 	if (!cpubuf) 
 		complain("Out of memory");
@@ -80,22 +104,32 @@ unsigned long *cpumask(char *s, int *ncp
 				set_bit(i, cpubuf);
 			break;
 		}
-		arg = strtoul(s, &end, 0); 
+		if (*s == '+') relative++;
+		arg = get_nr(s, &end, maxcpus, numa_all_cpus);
 		if (end == s)
 			complain("unparseable node description `%s'\n", s);
-		if (arg > numcpus)
+		if (arg > maxcpus)
 			complain("cpu argument %d is out of range\n", arg);
 		set_bit(arg, cpubuf);
 		s = end; 
-		if (*s == '-') { 
+		if (*s == '-') {
 			char *end2;
-			unsigned long arg2 = strtoul(++s, &end2, 0); 
+			unsigned long arg2;
+			if (relative && *(s+1) != '+') {
+				*s = '+';
+				arg2 = get_nr(s,&end2,maxcpus,numa_all_cpus);
+			} else {
+				arg2 = get_nr(++s,&end2,maxcpus,numa_all_cpus);
+			}
 			if (end2 == s)
 				complain("missing cpu argument %s\n", s);
-			if (arg > numcpus)
-				complain("cpu argument %d out of range\n", arg);
-			while (++arg <= arg2)
-				set_bit(arg, cpubuf);
+			if (arg2 > maxcpus)
+				complain("cpu argument %d out of range\n",arg2);
+			while (arg <= arg2) {
+				if (test_bit(arg, numa_all_cpus))
+					set_bit(arg, cpubuf);
+				arg++;
+			}
 			s = end2;
 		}
 	} while (*s++ == ','); 
@@ -103,7 +137,7 @@ unsigned long *cpumask(char *s, int *ncp
 		usage();
 	if (invert) { 
 		int i;
-		for (i = 0; i <= numcpus; i++) {
+		for (i = 0; i <= maxcpus; i++) {
 			if (test_bit(i, cpubuf))
 				clear_bit(i, cpubuf);
 			else
@@ -146,7 +180,7 @@ nodemask_t nodemask(char *s) 
 			mask = numa_all_nodes;
 			break;
 		}
-		arg = strtoul(s, &end, 0); 
+		arg = get_nr(s, &end, max, (unsigned long *)numa_all_nodes.n);
 		if (end == s)
 			complain("unparseable node description `%s'\n", s);
 		if (arg > max)
@@ -155,13 +189,17 @@ nodemask_t nodemask(char *s) 
 		s = end; 
 		if (*s == '-') { 
 			char *end2;
-			unsigned long arg2 = strtoul(++s, &end2, 0); 
+			unsigned long arg2 = get_nr(++s, &end2, max,
+					(unsigned long *)numa_all_nodes.n);
 			if (end2 == s)
 				complain("missing cpu argument %s\n", s);
-			if (arg > max)
-				complain("node argument %d out of range\n", arg);
-			while (++arg <= arg2)
-				nodemask_set(&mask, arg);
+			if (arg2 > max)
+				complain("node argument %d out of range\n",arg2);
+			while (arg <= arg2) {
+				if (nodemask_isset(&numa_all_nodes, arg))
+					nodemask_set(&mask, arg);
+				arg++;
+			}
 			s = end2;
 		}
 	} while (*s++ == ','); 
Index: numactl-1.0.2/libnuma.c
===================================================================
--- numactl-1.0.2.orig/libnuma.c
+++ numactl-1.0.2/libnuma.c
@@ -37,7 +37,8 @@
 
 #define WEAK __attribute__((weak))
 
-#define CPU_BUFFER_SIZE 4096     /* This limits you to 32768 CPUs */
+#define MAX_NR_CPUS		4096
+#define CPU_BUFFER_SIZE ((MAX_NR_CPUS + 8 - 1) / 8)
 
 const nodemask_t numa_no_nodes;
 const nodemask_t numa_all_nodes;
@@ -115,8 +116,10 @@ int numa_pagesize(void)
 
 make_internal_alias(numa_pagesize);
 
-static int maxnode = -1; 
-static int maxcpus = -1; 
+int maxnode = -1;
+int maxcpus = -1;
+
+unsigned long numa_all_cpus[(CPU_BUFFER_SIZE + BYTES_PER_LONG - 1) / BYTES_PER_LONG];
 
 static int number_of_configured_cpus(void)
 { 
@@ -150,33 +153,130 @@ static int fallback_max_node(void)
 	return maxnode;
 }
 
-int numa_max_node(void)
+/*
+ * Parse a kernel bitmask string: comma-separated 32-bit hex words, most
+ * significant word first. Stores the words into mask least significant
+ * first and returns the highest set bit number, or -1 for an empty mask.
+ */
+int read_mask(char *s, unsigned long *mask)
+{
+	char *end = s;
+	unsigned int *start = (unsigned int *)mask;
+	unsigned int *p, *q;
+	unsigned int i;
+	unsigned int n = 0;
+	int bit;
+
+	i = strtoul(s, &end, 16);
+
+	/* Skip leading zeros */
+	while (!i && *end++ == ',')
+		i = strtoul(end, &end, 16);
+
+	if (!i)
+		/* End of string. No mask */
+		return -1;
+
+	/*
+	 * Store the first non-zero word, then one word per following comma.
+	 * 'end' still points at the separator here, so step over it before
+	 * each strtoul(); otherwise every other word is read as 0.
+	 */
+	start[n++] = i;
+	while (*end++ == ',')
+		start[n++] = strtoul(end, &end, 16);
+	n--;
+
+	/*
+	 * The words were stored in reading order (highest first); reverse
+	 * them in place so that start[0] holds bits 0-31.
+	 */
+	for (q = start + n, p = start; p < q; q--, p++) {
+		unsigned int x = *q;
+
+		*q = *p;
+		*p = x;
+	}
+
+	/* Poor man's fls(): start[n] is non-zero, so this stops at bit >= 0 */
+	for (bit = 31; bit > 0; bit--)
+		if (start[n] & (1U << bit))
+			break;
+
+	return 32 * n + bit;
+}
+
+/*
+ * Read this process's allowed cpus and nodes from /proc/<pid>/status.
+ * Returns 1 if maxnode is valid (>= 0) afterwards, 0 otherwise.
+ */
+int read_constraints(void)
+{
+	FILE *f;
+	/*
+	 * The maximum line size consists of the string at the beginning plus
+	 * a digit for each 4 cpus and a comma for each 32 cpus.
+	 */
+	char buffer[MAX_NR_CPUS / 4 + MAX_NR_CPUS / 32 + 20];
+
+	snprintf(buffer, sizeof(buffer), "/proc/%d/status", getpid());
+	f = fopen(buffer, "r");
+	if (!f)
+		return 0;
+
+	/* Match the ':' as well so "Cpus_allowed_list:" etc. are not parsed */
+	while (fgets(buffer, sizeof(buffer), f)) {
+		if (strncmp(buffer,"Cpus_allowed:",13) == 0)
+			maxcpus = read_mask(buffer + 14, numa_all_cpus);
+
+		if (strncmp(buffer,"Mems_allowed:",13) == 0) {
+			*(nodemask_t *)&numa_all_nodes = numa_no_nodes;
+			maxnode = read_mask(buffer + 14,
+					(unsigned long *)numa_all_nodes.n);
+		}
+	}
+	fclose(f);
+
+	if (maxnode < 0)
+		return 0;
+	return 1;
+}
+
+void determine_nodes(void)
 {
 	DIR *d;
 	struct dirent *de;
 	int found;
 
-	/* No hotplug yet. */
-	if (maxnode >= 0) 
-		return maxnode;
-	d = opendir("/sys/devices/system/node"); 
+	d = opendir("/sys/devices/system/node");
 	if (!d)
-		return fallback_max_node();
+		goto fail;
+
 	found = 0;
-	while ((de = readdir(d)) != NULL) { 
+	while ((de = readdir(d)) != NULL) {
 		int nd;
 		if (strncmp(de->d_name, "node", 4))
 			continue;
 		found++;
-		nd = strtoul(de->d_name+4, NULL, 0); 
-		if (maxnode < nd) 
-			maxnode = nd; 
-	} 
-	closedir(d); 
-	if (found == 0) 
-		return fallback_max_node();
+		nd = strtoul(de->d_name+4, NULL, 0);
+		if (maxnode < nd)
+			maxnode = nd;
+	}
+	closedir(d);
+	if (found)
+		return;
+fail:
+	maxnode  = fallback_max_node();
+}
+int numa_max_node(void)
+{
+	if (maxnode >= 0)
+		return maxnode;
+	if (!read_constraints())
+		determine_nodes();
+
 	return maxnode;
-} 
+}
 
 make_internal_alias(numa_max_node);
 
@@ -242,12 +342,9 @@ long numa_node_size(int node, long *free
 
 int numa_available(void)
 {
-	int max,i;
 	if (get_mempolicy_int(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS) 
 		return -1; 
-	max = numa_max_node_int();
-	for (i = 0; i <= max; i++) 
-		nodemask_set((nodemask_t *)&numa_all_nodes, i); 
+	numa_max_node_int();
 	return 0;
 } 
 
@@ -429,10 +526,10 @@ int numa_parse_bitmap(char *line, unsign
 		if (*p == ',')
 			p++;
 		if (i >= CPU_LONGS(ncpus))
-			return -1;
+			return 0; /* filled the mask */
 		mask[i] = strtoul(p, &endp, 16);
 		if (endp != oldp)
-			return -1;
+			return 0;  /* we filled the mask */
 		p--;
 	}
 	return 0;
@@ -557,7 +654,7 @@ make_internal_alias(numa_run_on_node_mas
 
 nodemask_t numa_get_run_node_mask(void)
 { 
-	int ncpus = number_of_configured_cpus();
+	int ncpus = NUMA_NUM_NODES;
 	nodemask_t mask;
 	int i, k;
 	int max = numa_max_node_int();
Index: numactl-1.0.2/numactl.8
===================================================================
--- numactl-1.0.2.orig/numactl.8
+++ numactl-1.0.2/numactl.8
@@ -34,7 +34,7 @@ numactl \- Control NUMA policy for proce
 .B \-\-physcpubind cpus
 ] [
 .B \-\-localalloc
-] command {arguments ...}
+] [\-\-] command {arguments ...}
 .br
 .B numactl \-\-show
 .br
@@ -72,40 +72,75 @@ runs processes with a specific NUMA sche
 The policy is set for command and inherited by all of its children.
 In addition it can set persistent policy for shared memory segments or files.
 .TP
+Use -- before command if using command options that could be confused
+with numactl options.
+.TP
 Policy settings are:
 .TP
 .B \-\-interleave=nodes, \-i nodes
-Set an memory interleave policy. Memory will be allocated using round robin
+Set a memory interleave policy. Memory will be allocated using round robin
 on 
 .I nodes.
 When memory cannot be allocated on the current interleave target fall back
 to other nodes.
+Multiple nodes may be specified on --interleave, --membind and --cpunodebind.
+You may specify "all", which means all nodes in the current cpuset.
+.I nodes
+may be specified as N,N,N or  N-N or N,N-N or  N-N,N-N and so forth.
+Relative
+.I nodes
+may be specified as +N,N,N or  +N-N or +N,N-N and so forth. The + indicates that
+the node numbers are relative to the process' set of allowed nodes in its
+current cpuset.
+A !N-N notation indicates the inverse of N-N, in other words all nodes
+except N-N.  If used with + notation, specify !+N-N.
 .TP
 .B \-\-membind=nodes, \-m nodes
 Only allocate memory from nodes.  Allocation will fail when there
 is not enough memory available on these nodes.
+.I nodes
+may be specified as noted above.
 .TP
 .B \-\-cpunodebind=nodes, \-N nodes
-Only execute process on the CPUs of
+Only execute
+.I command
+on the CPUs of
 .I nodes. 
 Note that nodes may consist of multiple CPUs.
+.I nodes
+may be specified as noted above.
 .TP
 .B \-\-physcpubind=cpus, \-C cpus
-Only execute process on 
+Only execute
+.I process
+on
 .I cpus.
 This accepts physical cpu numbers as shown in the 
 .I processor
 fields of 
-.I /proc/cpuinfo.
+.I /proc/cpuinfo,
+or cpu numbers relative to the current cpuset.
+You may specify "all", which means all cpus in the current cpuset.
+Physical
+.I cpus
+may be specified as N,N,N or  N-N or N,N-N or  N-N,N-N and so forth.
+Relative
+.I cpus
+may be specified as +N,N,N or  +N-N or +N,N-N and so forth. The + indicates that
+the cpu numbers are relative to the process' set of allowed cpus in its
+current cpuset.
+A !N-N notation indicates the inverse of N-N, in other words all cpus
+except N-N.  If used with + notation, specify !+N-N.
 .TP
 .B \-\-localalloc, \-l 
-Do always local allocation on the current node.
+Always allocate on the current node.
 .TP
 .B \-\-preferred=node
 Preferably allocate memory on 
 .I node,
 but if memory cannot be allocated there fall back to other nodes.
 This option takes only a single node number.
+Relative notation may be used.
 .TP
 .B \-\-show, \-s
 Show NUMA policy settings of the current process. 
@@ -201,12 +236,19 @@ number1-number2:Nodes from number1 to nu
 ! nodes:Invert selection of the following specification.
 .TE
 .SH EXAMPLES
+numactl \-\-physcpubind=+0-4,8-12 myapplic arguments
+Run myapplic on cpus 0-4 and 8-12 of the current cpuset.
+
 numactl \-\-interleave=all bigdatabase arguments
 Run big database with its memory interleaved on all CPUs.
 
-numactl \-\-cpubind=0\-\-membind=0,1 process
+numactl \-\-cpubind=0 \-\-membind=0,1 process
 Run process on node 0 with memory allocated on node 0 and 1.
 
+numactl \-\-cpubind=0 \-\-membind=0,1 -- process -l
+Run process as above, but with an option (-l) that would be confused with
+a numactl option.
+
 numactl \-\-preferred=1 numactl \-\-show
 Set preferred node 1 and show the resulting state.
 
Index: numactl-1.0.2/numactl.c
===================================================================
--- numactl-1.0.2.orig/numactl.c
+++ numactl-1.0.2/numactl.c
@@ -74,6 +74,7 @@ void usage(void)
 		"nodes is a comma delimited list of node numbers or A-B ranges or all.\n"
 		"cpus is a comma delimited list of cpu numbers or A-B ranges or all\n"
 		"all ranges can be inverted with !\n"
+		"all numbers and ranges can be made cpuset-relative with +\n"
 		"the old --cpubind argument is deprecated.\n"
 		"use --cpunodebind or --physcpubind instead\n"
 		"length can have g (GB), m (MB) or k (KB) suffixes\n");
@@ -316,8 +317,8 @@ void get_short_opts(struct option *o, ch
 
 int main(int ac, char **av)
 {
-	int c;
-	long arg; 
+	int c, i, nnodes=0;
+	long node=-1;
 	char *end;
 	char shortopts[array_len(opts)*2 + 1];
 	get_short_opts(opts,shortopts);
@@ -357,6 +358,7 @@ int main(int ac, char **av)
 		{
 			int ncpus;
 			unsigned long *cpubuf;
+			numa_max_node();
 			dontshm("-C/--physcpubind");
 			cpubuf = cpumask(optarg, &ncpus);
 			errno = 0;
@@ -384,18 +386,21 @@ int main(int ac, char **av)
 		case 'p': /* --preferred */
 			checknuma();
 			setpolicy(MPOL_PREFERRED);
-			arg = strtoul(optarg,&end,0); 
-			if (*end || end == optarg || arg < 0 || arg > numa_max_node()) 
+			/* Scan all NUMA_NUM_NODES bits; sizeof(mask) would count bytes */
+			mask = nodemask(optarg);
+			for (i=0; i<NUMA_NUM_NODES; i++)
+				if (nodemask_isset(&mask, i)) {
+					node = i;
+					nnodes++;
+				}
+			if (nnodes != 1)
 				usage();
 			errno = 0;
 			numa_set_bind_policy(0);
-			nodemask_zero(&mask);
-			nodemask_set(&mask, arg);
-			numa_set_bind_policy(0);
 			if (shmfd >= 0) 
-				numa_tonode_memory(shmptr, shmlen, arg);
+				numa_tonode_memory(shmptr, shmlen, node);
 			else
-				numa_set_preferred(arg);
+				numa_set_preferred(node);
 			checkerror("setting preferred node");
 			break;
 		case 'l': /* --local */
Index: numactl-1.0.2/test/regress
===================================================================
--- numactl-1.0.2.orig/test/regress
+++ numactl-1.0.2/test/regress
@@ -4,11 +4,12 @@
 # Copyright 2003,2004 Andi Kleen, SuSE Labs.
 
 MB=$[1024*1024]
-SIZE=$[30 * $MB]
+SIZE=$[15 * $MB]
 DEMOSIZE=$[10 * $MB]
 VALGRIND=${VALGRIND:-}
 STAT_INTERVAL=5
 
+# cpw: use numactl or numactl.old
 numactl() { 
 	$VALGRIND ../numactl "$@"
 }
@@ -34,7 +35,9 @@ fi
 PAGESIZE=`./pagesize`
 PAGES=`expr $SIZE / $PAGESIZE`
 HALFPAGES=`expr $PAGES / 2`
+HALFPAGES=`expr $HALFPAGES - 100`
 DOUBLEPAGES=`expr $PAGES \* 2`
+DOUBLEPAGES=`expr $DOUBLEPAGES - 200`
 
 FAILED='========SUCCESS'
 
