Monday, December 28, 2009

My First GPU HelloWorld Program

Finally, I got my OpenCL-based Java programs running on my AMD Phenom II 965 BE with a Radeon HD 5750.



package com.nativelibs4java.opencl.demos;

import static com.nativelibs4java.opencl.JavaCL.createBestContext;
import static com.nativelibs4java.util.NIOUtils.directFloats;

import java.io.IOException;
import java.nio.FloatBuffer;
import java.util.Random;

import com.nativelibs4java.opencl.CLBuildException;
import com.nativelibs4java.opencl.CLContext;
import com.nativelibs4java.opencl.CLEvent;
import com.nativelibs4java.opencl.CLFloatBuffer;
import com.nativelibs4java.opencl.CLKernel;
import com.nativelibs4java.opencl.CLMem;
import com.nativelibs4java.opencl.CLProgram;
import com.nativelibs4java.opencl.CLQueue;
import com.nativelibs4java.opencl.demos.SetupUtils;
import com.nativelibs4java.util.NIOUtils;

/* Derived from
* http://bbboblog.blogspot.com/2009/10/gpgpu-mandelbrot-with-opencl-and-java.html
* http://developer.apple.com/mac/library/samplecode/OpenCL_Hello_World_Example/index.html
*/
/**
 * Minimal OpenCL demo: fills a buffer with random floats, squares every element
 * with an OpenCL kernel through the JavaCL bindings, then checks the results on
 * the host and prints how long the build-and-run took.
 */
public class HelloWorld {
    /** OpenCL C source of the {@code square} kernel: output[i] = input[i] * input[i] for i &lt; count. */
    private static final String KERNEL_SOURCE = "__kernel void square( \n"
            + " __global float* input, \n"
            + " __global float* output, \n"
            + " const unsigned int count) \n"
            + "{ \n"
            + " int i = get_global_id(0); \n"
            + " if(i < count) \n"
            + " output[i] = input[i] * input[i]; \n"
            + "} \n"
            + "\n";

    /** Side length used to derive the element count. */
    private static final int SIDE = 32;
    /** Total number of floats processed (SIDE * SIDE = 1024). */
    private static final int DATA_SIZE = SIDE * SIDE;

    /** Host buffer handed to JavaCL when the output device buffer is created. */
    private static final FloatBuffer DEVICE_RESULTS = directFloats(DATA_SIZE);
    /** Host buffer holding the random input values. */
    private static final FloatBuffer INPUT = directFloats(DATA_SIZE);
    /** Host buffer the kernel results are read back into; this is what gets verified. */
    private static final FloatBuffer output = directFloats(DATA_SIZE);

    /**
     * Runs the demo: generates input, executes the kernel, verifies every
     * element against a host-side {@code v * v} and prints a summary.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable();

            Random random = new Random();
            for (int i = 0; i < DATA_SIZE; i++) {
                INPUT.put(i, random.nextFloat());
            }

            long elapsedNanos = buildAndExecuteKernel(INPUT, DEVICE_RESULTS, DATA_SIZE, KERNEL_SOURCE);

            int correct = 0; // number of correct results returned
            for (int i = 0; i < DATA_SIZE; i++) {
                float v = INPUT.get(i);
                float expected = v * v;
                float r = output.get(i);
                if (r == expected) {
                    correct++;
                } else {
                    // Report the expected square, not the raw input value.
                    System.err.println("ERROR: got:" + r + ", Required-Ans:" + expected);
                }
            }
            System.out.println(DATA_SIZE + " entries are computed in " + (elapsedNanos / 1000)
                    + " microseconds, correct entries:" + correct);
        } catch (Exception e) {
            // printStackTrace already prints the exception's toString() as its first line.
            e.printStackTrace();
        }
    }

    /**
     * Builds the given kernel source, runs its {@code square} kernel over
     * {@code dataSize} work-items and reads the result back into the class-level
     * {@code output} buffer.
     *
     * <p>The returned duration starts after the context and queue have been
     * created and covers program build, buffer creation, kernel execution and
     * the read-back.
     *
     * @param data     host buffer with the input values
     * @param results  host buffer passed to JavaCL when creating the output device buffer
     * @param dataSize number of elements to process; used both as the kernel's
     *                 {@code count} argument and as the global work size
     * @param src      OpenCL C source containing a kernel named {@code square}
     * @return elapsed wall-clock time in nanoseconds
     * @throws CLBuildException declared by the original signature; presumably raised when the program fails to build
     * @throws IOException      declared by the original signature
     */
    private static long buildAndExecuteKernel(FloatBuffer data, FloatBuffer results, int dataSize, String src)
            throws CLBuildException, IOException {
        // Create a context and program using the devices discovered.
        CLContext context = createBestContext();
        CLQueue queue = context.createDefaultQueue();

        long startTime = System.nanoTime();

        CLProgram program = context.createProgram(src).build();
        CLKernel kernel = program.createKernel("square");

        // NOTE(review): the trailing 'false' is presumably JavaCL's "copy" flag
        // (wrap the host buffer rather than copying it) - confirm against the JavaCL docs.
        CLFloatBuffer in1 = context.createFloatBuffer(CLMem.Usage.Input, data, false);
        CLFloatBuffer out1 = context.createFloatBuffer(CLMem.Usage.Output, results, false);
        kernel.setArgs(in1, out1, dataSize);

        // Launch one work-item per element, sized from the dataSize parameter so the
        // launch always agrees with the 'count' argument given to the kernel.
        // NOTE(review): a local size of 1 puts each work-item in its own work-group;
        // passing null instead presumably lets the runtime choose - confirm before changing.
        CLEvent kernelCompletion = kernel.enqueueNDRange(queue,
                new int[] { dataSize }, new int[] { 1 });
        kernelCompletion.waitFor();
        queue.finish();

        // Copy the OpenCL-hosted array back to RAM (into the class-level output buffer).
        out1.read(queue, output, true);

        return System.nanoTime() - startTime;
    }
}