-
-
Save LouisJenkinsCS/e0f525f212d475ca2e6a5af79999e840 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use Random; | |
| use Time; | |
| use BlockDist; | |
| // use CommDiagnostics; | |
| // See: https://chapel-lang.org/docs/master/modules/standard/CommDiagnostics.html | |
/* Config variables (overridable on the command line, e.g. --k=4) */
config const filename = "../data/test_10_10_2.chpl.txt"; // input points file
config const outFilename = "out.txt";                    // output labels file
config const k = 2;                                      // number of clusters
config const maxIter = 10;                               // k-means iterations
config const numPoints = 10;                             // points in the input file

/* Datatypes */
// A point is a fixed-arity tuple of 10 reals (one coordinate per dimension).
type Point = 10 * real;
| // Note: This is a bit of information that may help influence the decision to use a 'record' or a 'class' | |
| // A 'class' is equivalent to a heap-allocated object that is globally visible to all compute-nodes; essentially | |
| // your run-of-the-mill pointer to a 'struct'. A 'record' on the other-hand is a stack-allocated object that exists | |
| // on the compute-node it was allocated on. Hence if you initialize a local variable to a 'record' (which will invoke | |
| // the initializer) and then access it from an 'on' statement (e.g. from another compute-node), by default all accesses | |
| // will be by-reference. Note that a store/load to a 'wide' reference, being a reference to remote memory, is resolved | |
| // by the compiler (with runtime assistance) into a PUT/GET respectively. Note as well that in Chapel, invoking a method | |
| // is not implicitly handled in a Remote-Method-Invocation (RMI), it will invoke the method with the implicit 'this' | |
| // treated as an wide-reference, meaning any and all accesses to fields will be resolved remotely into, you guessed it, | |
| // PUT/GET. If you *need* to continue execution on the locale it is allocated on, use an `on this` statement to wrap the | |
| // rest of code being executed. | |
// A k-means cluster: running accumulation of assigned points plus the
// current centroid. Stack-allocated record; accessed remotely by reference.
record Cluster {
  var size: int;        // number of points assigned this iteration
  var pointSum: Point;  // componentwise sum of assigned points
  var mean: Point;      // current centroid

  proc init() {}

  // Seed a cluster with an initial centroid.
  proc init(point: Point) {
    mean = point;
  }

  // Euclidean distance from this cluster's centroid to 'p'.
  proc distance(p: Point) {
    const squaredDiff = (mean - p) ** 2;
    // Data-parallel reduction over the componentwise squared differences
    // (replaces the original serial accumulation loop).
    return sqrt(+ reduce squaredDiff);
  }

  // Merge another cluster's accumulated state into this one.
  // 'const ref' avoids copying the record by value (per the review note).
  proc addCluster(const ref other: Cluster) {
    size += other.size;
    pointSum += other.pointSum;
  }

  // Account for a point newly assigned to this cluster.
  proc addPoint(ref p: Point) {
    pointSum += p;
    size += 1;
  }

  proc setMean(ref p: Point) {
    mean = p;
  }

  // Recompute the centroid from the accumulated sum.
  // Guard against an empty cluster: the original divided unconditionally,
  // so size == 0 produced divide-by-zero / non-finite centroid components.
  proc calcMean() {
    if size != 0 then
      mean = pointSum / size;
  }
}
// Combine two clusters into a brand-new Cluster. Used by '+ reduce' over
// arrays of Clusters in main().
//
// The original implementation mutated its LHS (c1.addCluster(c2)) and
// returned it -- exactly the side effect the inline review comment warned
// about: '+' must be pure so reductions behave deterministically. This
// version leaves both operands untouched.
proc +(c1: Cluster, c2: Cluster) {
  var combined: Cluster;
  combined.size = c1.size + c2.size;
  combined.pointSum = c1.pointSum + c2.pointSum;
  // Carry the LHS mean forward (matches the original's observable result;
  // callers recompute it via calcMean() after the reduction anyway).
  combined.mean = c1.mean;
  return combined;
}
// Distributed k-means: read numPoints Points, run maxIter assignment/update
// iterations over k clusters, then write one label per point to outFilename.
proc main() {
  // startVerboseComm();
  var pointsSpace = {0..#numPoints};
  var pointsDomain = pointsSpace dmapped Block(boundingBox=pointsSpace);
  var points: [pointsDomain] Point;   // block-distributed input points
  var labels: [pointsDomain] int;     // cluster index assigned to each point
  // Note: allocated on Locale#0; remote accesses resolve to PUT/GET.
  var clustersDomain = {0..#k};
  var clusters: [clustersDomain] Cluster;

  /* Read input file */
  var f = open(filename, iomode.r);
  var reader = f.reader();
  for i in pointsDomain {
    points[i] = reader.read(Point);
  }
  // BUG FIX: close the reader channel before its underlying file. The
  // original closed the file first, invalidating the still-open channel.
  reader.close();
  f.close();

  var watch: Timer;
  watch.start();

  /* Algorithm */
  var randStream = new owned RandomStream(real);
  // Seed each cluster with a randomly chosen input point.
  // (Review note: randStream.iterate(clustersDomain) zipped with clusters
  // would do this in parallel without contending on the stream's lock.)
  for i in clustersDomain {
    var idx: int = (randStream.getNext() * numPoints):int;
    clusters[i] = new Cluster(points[idx]);
  }

  for iteration in 0..#maxIter {
    // Per-locale accumulators, merged below with '+ reduce' to avoid
    // cross-locale contention during the assignment phase.
    var newClustersPerLocale: [0..#numLocales][clustersDomain] Cluster;
    // Review note: 'forall (label, point) in zip(labels, points)' would
    // express this assignment phase more idiomatically and in parallel.
    coforall L in Locales {
      on L {
        const indices = points.localSubdomain();
        for i in indices {
          // Bind by reference: tuples copy by value otherwise.
          ref point = points[i];
          var minIndex = 0;
          var minValue = Math.INFINITY;
          // Find the nearest cluster centroid. (Review note: this serial
          // scan over clusters -- resident on Locale#0 -- is a likely
          // bottleneck; a distributed 'min reduce' would scale better.)
          for (idx, cluster) in zip(clusters.domain, clusters) {
            var distance = cluster.distance(point);
            if (distance < minValue) {
              minValue = distance;
              minIndex = idx;
            }
          }
          labels[i] = minIndex;
          newClustersPerLocale[here.id][minIndex].addPoint(point);
        }
      }
    }
    // Merge the per-locale partial clusters, then recompute centroids.
    var newClusters = + reduce newClustersPerLocale;
    for cluster in newClusters {
      cluster.calcMean();
    }
    clusters = newClusters;
  }

  writeln(watch.elapsed());

  // Output labels, one per line, in point order.
  var outf = open(outFilename, iomode.cw);
  var writer = outf.writer();
  for lbl in labels {
    writer.writeln(lbl);
  }
  writer.close();
  outf.close();
  // stopVerboseComm();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment