using Hybridizer.Runtime.CUDAImports;
using System;
using System.Threading.Tasks;

// Plain: the reduction operator (+) is hard-coded in the kernel.
namespace Plain
{
    class Reduction
    {
        [EntryPoint]
        public static void Run(int N, float[] a, float[] result)
        {
            var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            // grid-stride loop: each thread accumulates its share of the input
            float tmp = 0.0F;
            while (tid < N)
            {
                tmp += a[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;
            CUDAIntrinsics.__syncthreads();

            // tree reduction in shared memory within the block
            int i = blockDim.x / 2;
            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            // one atomic combine per block into the global result
            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
            }
        }
    }
}

// Virtual: the operator is passed as an interface, dispatched through a virtual call.
namespace Virtual
{
    public interface IFunc
    {
        [Kernel]
        float run(float x, float y);
    }

    public class AddFunc : IFunc
    {
        [Kernel]
        public float run(float x, float y)
        {
            return x + y;
        }
    }

    public class Reduction
    {
        [EntryPoint]
        public static void Run(IFunc func, int N, float[] a, float[] result)
        {
            var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;
            while (tid < N)
            {
                tmp = func.run(tmp, a[tid]);
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;
            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;
            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = func.run(cache[cacheIndex], cache[cacheIndex + i]);
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], func.run);
            }
        }
    }
}

// Generics: the operator is a struct type parameter, specialized at compile time.
namespace Generics
{
    [HybridTemplateConcept]
    public interface IFunc
    {
        [Kernel]
        float run(float x, float y);
    }

    public struct AddFunc : IFunc
    {
        [Kernel]
        public float run(float x, float y)
        {
            return x + y;
        }
    }

    [HybridRegisterTemplate(Specialize = typeof(Reduction<AddFunc>))]
    public class Reduction<T> where T : struct, IFunc
    {
        [Kernel]
        T reductor { get; set; }

        public Reduction()
        {
            reductor = default(T);
        }

        [Kernel]
        public void Run(int N, float[] a, float[] result)
        {
            var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;
            while (tid < N)
            {
                tmp = reductor.run(tmp, a[tid]);
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;
            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;
            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = reductor.run(cache[cacheIndex], cache[cacheIndex + i]);
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], reductor.run);
            }
        }
    }

    public class EntryPoints
    {
        [EntryPoint]
        public static void ReduceAdd(Reduction<AddFunc> reductor, int N, float[] a, float[] result)
        {
            reductor.Run(N, a, result);
        }
    }
}

// Delegates: the operator is a Func<float, float, float> stored in a field.
namespace Delegates
{
    public class Reduction
    {
        Func<float, float, float> localReductor;

        public Reduction(Func<float, float, float> func)
        {
            localReductor = func;
        }

        [Kernel]
        public void Reduce(int N, float[] a, float[] result)
        {
            var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;
            while (tid < N)
            {
                tmp = localReductor(tmp, a[tid]);
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;
            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;
            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = localReductor(cache[cacheIndex], cache[cacheIndex + i]);
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], localReductor);
            }
        }

        [Kernel]
        public static float Add(float x, float y)
        {
            return x + y;
        }
    }
}

// Delegates2: same as Delegates, but the delegate is copied to a local before the loop.
namespace Delegates2
{
    public class Reduction
    {
        Func<float, float, float> localReductor;

        public Reduction(Func<float, float, float> func)
        {
            localReductor = func;
        }

        [Kernel]
        public void Reduce(int N, float[] a, float[] result)
        {
            Func<float, float, float> f = localReductor;
            var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;
            while (tid < N)
            {
                tmp = f(tmp, a[tid]);
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;
            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;
            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = f(cache[cacheIndex], cache[cacheIndex + i]);
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], f);
            }
        }

        [Kernel]
        public static float Add(float x, float y)
        {
            return x + y;
        }
    }
}

// Benchmark driver: runs every variant over the same input and prints the result.
namespace GpuHybridizerBenchmark
{
    class Program
    {
        public static void Main()
        {
            int N = 1024 * 1024 * 32;
            float[] a = new float[N];
            Parallel.For(0, N, i => { a[i] = 1.0F; });
            float[] result = new float[1];

            cudaDeviceProp prop;
            cuda.GetDeviceProperties(out prop, 0);
            var runner = HybRunner.Cuda();

            {
                Console.WriteLine("##### PLAIN #####");
                dynamic wrapped = runner.Wrap(new Plain.Reduction());
                for (int threadCount = 64; threadCount <= prop.maxThreadsPerBlock; threadCount <<= 1)
                {
                    result[0] = 0.0F;
                    int blocksPerSm = wrapped.MaxBlocksPerSM(new Action<int, float[], float[]>(Plain.Reduction.Run), threadCount, 16 + threadCount * sizeof(float));
                    wrapped.SetDistrib(blocksPerSm * prop.multiProcessorCount, 1, threadCount, 1, 1, threadCount * sizeof(float));
                    wrapped.Run(N, a, result);
                    Console.WriteLine($"result = {result[0]}");
                }
            }

            {
                Console.WriteLine("##### VIRTUAL #####");
                dynamic wrapped = runner.Wrap(new Virtual.Reduction());
                for (int threadCount = 64; threadCount <= prop.maxThreadsPerBlock; threadCount <<= 1)
                {
                    result[0] = 0.0F;
                    int blocksPerSm = wrapped.MaxBlocksPerSM(new Action<Virtual.IFunc, int, float[], float[]>(Virtual.Reduction.Run), threadCount, 16 + threadCount * sizeof(float));
                    wrapped.SetDistrib(blocksPerSm * prop.multiProcessorCount, 1, threadCount, 1, 1, threadCount * sizeof(float));
                    wrapped.Run(new Virtual.AddFunc(), N, a, result);
                    Console.WriteLine($"result = {result[0]}");
                }
            }

            {
                Console.WriteLine("##### GENERICS #####");
                dynamic wrapped = runner.Wrap(new Generics.EntryPoints());
                for (int threadCount = 64; threadCount <= prop.maxThreadsPerBlock; threadCount <<= 1)
                {
                    result[0] = 0.0F;
                    int blocksPerSm = wrapped.MaxBlocksPerSM(new Action<Generics.Reduction<Generics.AddFunc>, int, float[], float[]>(Generics.EntryPoints.ReduceAdd), threadCount, 16 + threadCount * sizeof(float));
                    wrapped.SetDistrib(blocksPerSm * prop.multiProcessorCount, 1, threadCount, 1, 1, threadCount * sizeof(float));
                    wrapped.ReduceAdd(new Generics.Reduction<Generics.AddFunc>(), N, a, result);
                    Console.WriteLine($"result = {result[0]}");
                }
            }

            {
                Console.WriteLine("##### DELEGATES #####");
                result[0] = 0.0F;
                // no occupancy calculator for instance entrypoint (yet)
                var instance = new Delegates.Reduction(Delegates.Reduction.Add);
                dynamic wrapped = runner.Wrap(instance);
                wrapped.SetDistrib(16 * prop.multiProcessorCount, 1, 128, 1, 1, 128 * sizeof(float));
                wrapped.Reduce(N, a, result);
                Console.WriteLine($"result = {result[0]}");
            }

            {
                Console.WriteLine("##### DELEGATES2 #####");
                result[0] = 0.0F;
                // no occupancy calculator for instance entrypoint (yet)
                var instance = new Delegates2.Reduction(Delegates2.Reduction.Add);
                dynamic wrapped = runner.Wrap(instance);
                wrapped.SetDistrib(16 * prop.multiProcessorCount, 1, 128, 1, 1, 128 * sizeof(float));
                wrapped.Reduce(N, a, result);
                Console.WriteLine($"result = {result[0]}");
            }
        }
    }
}
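// Optional CPU sanity check, sketched as an addition to the listing above (the
// CpuReference namespace and Check.Matches are hypothetical, not part of the sample):
// since every element of `a` is 1.0F, each variant should report result = N.
namespace CpuReference
{
    static class Check
    {
        // Returns true when the GPU reduction agrees with a sequential CPU sum.
        public static bool Matches(float[] a, float gpuResult)
        {
            double sum = 0.0;                 // accumulate in double to limit rounding error
            for (int i = 0; i < a.Length; ++i)
            {
                sum += a[i];
            }
            return (float)sum == gpuResult;   // compare against the value written by the kernel
        }
    }
}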