import numpy as np
from pycuda import autoinit, driver as cuda, gpuarray
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel
# Host-side input vector: the 256 floats 0.0 .. 255.0.
vectorHost = np.arange(256, dtype=np.float32)
# Report basic properties of the GPU that pycuda.autoinit selected.
device = autoinit.device
print('GPU Name: {}'.format(device.name()))
print('GPU Memory: {:,}'.format(device.total_memory()))
# device.get_attributes() maps CUDA device attributes to their values.
# .items() (not the Python-2-only .iteritems()) works on both Python 2 and 3.
print('\n'.join('{}: {:,}'.format(key, value) for key, value in device.get_attributes().items()))
# Define computation directly.
# Send vector to device
vectorDevice = gpuarray.to_gpu(vectorHost)
# Get result from device: gpuarray overloads arithmetic, so 2 * vectorDevice
# runs elementwise on the GPU; .get() copies the result back to the host.
# Assign and print the results so the script has observable output
# (previously both values were computed and silently discarded).
doubledHost = (2 * vectorDevice).get()
print(doubledHost)
# gpuarray.dot computes the dot product on the device; .get() returns
# the scalar result to the host.
dotHost = gpuarray.dot(vectorDevice, vectorDevice).get()
print(dotHost)
# Define computation with ElementwiseKernel.
# Prepare kernel: ElementwiseKernel generates and compiles a CUDA kernel
# that applies the given C expression to each element index i.
kernel = ElementwiseKernel(
    'float *vector',
    'vector[i] *= 2')
# Send vector to device
vectorHost = np.arange(256, dtype=np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)
# Run the kernel in place on the device, then copy the result back
kernel(vectorDevice)
vectorHost = vectorDevice.get()
print(vectorHost)
# Define computation with SourceModule.
# Prepare kernel: compile raw CUDA C. Each thread doubles the one element
# indexed by its thread id; a single 256-thread block covers the vector.
kernel = SourceModule("""
__global__ void doublify(float *vector) {
  vector[threadIdx.x] *= 2;
}
""").get_function('doublify')
# Compute in place on the host array: cuda.InOut copies it to the device
# before the launch and back afterwards.
vectorHost = np.arange(256, dtype=np.float32)
kernel(cuda.InOut(vectorHost), block=(vectorHost.size, 1, 1))
print(vectorHost)
# Prepare kernel: same doublify, but the multiplier is injected into the
# CUDA source via str.format — literal CUDA braces are escaped as {{ }}.
kernel = SourceModule("""
__global__ void doublify(float *vector) {{
  vector[threadIdx.x] *= {scalar};
}}
""".format(scalar=2)).get_function('doublify')
# Compute in place on the host array via cuda.InOut
vectorHost = np.arange(256, dtype=np.float32)
kernel(cuda.InOut(vectorHost), block=(vectorHost.size, 1, 1))
print(vectorHost)
# Prepare kernel: 2D thread block. Each thread flattens its (x, y) thread
# index into a row-major offset and doubles that element.
kernel = SourceModule("""
__global__ void doublify(float *vector) {
  int offset = threadIdx.x + threadIdx.y * blockDim.x;
  vector[offset] *= 2;
}
""").get_function('doublify')
# Compute on a 2x2 matrix; the block dimensions mirror the array shape so
# there is exactly one thread per element.
vectorHost = np.array([[1, 2], [3, 4]]).astype(np.float32)
kernel(cuda.InOut(vectorHost), block=(
    vectorHost.shape[0],
    vectorHost.shape[1],
    1,
))
print(vectorHost)
# Define operation with ReductionKernel.
# Prepare kernel: a GPU map-reduce computing a dot product — the map
# multiplies paired elements, the reduce sums the products.
kernel = ReductionKernel(
    np.float32,                      # Precision for input and output
    neutral='0',                     # Starting value for reduction
    map_expr='x[i] * y[i]',          # C code defining map()
    reduce_expr='a + b',             # C code defining reduce()
    arguments='float *x, float *y')  # Function arguments
# Send vector to device
vectorHost = np.arange(256, dtype=np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)
# Get result from device: .get() copies the reduced scalar to the host
resultHost = kernel(vectorDevice, vectorDevice).get()
print(resultHost)
# Send vector to device
vectorHost = np.arange(256, dtype=np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)
# Start timer: CUDA events are recorded into the device's command stream,
# so they measure GPU work rather than Python execution time.
start = cuda.Event()
end = cuda.Event()
start.record()
# Get result from device
vectorHost = (2 * vectorDevice).get()
# End timer; synchronize blocks until the end event has actually occurred
end.record()
end.synchronize()
# time_till returns milliseconds, hence the 1e-3 conversion to seconds
print('{:,} seconds'.format(start.time_till(end) * 1e-3))