import numpy as np
from pycuda import autoinit, driver as cuda, gpuarray
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel
# Host-side input vector: the 256 floats 0.0 .. 255.0.
vectorHost = np.arange(256, dtype=np.float32)
# Report basic properties of the GPU that pycuda.autoinit selected.
device = autoinit.device
print('GPU Name: {}'.format(device.name()))
print('GPU Memory: {:,}'.format(device.total_memory()))
# device.get_attributes() maps CUDA device attributes to their values.
# .items() (not the Python-2-only .iteritems()) works on both Python 2 and 3.
print('\n'.join('{}: {:,}'.format(key, value) for key, value in device.get_attributes().items()))
# Define computation directly.
# Send vector to device
vectorDevice = gpuarray.to_gpu(vectorHost)
# Get result from device: gpuarray overloads arithmetic, so 2 * vectorDevice
# runs elementwise on the GPU; .get() copies the result back to the host.
# Assign and print the results so the script has observable output
# (previously both values were computed and silently discarded).
doubledHost = (2 * vectorDevice).get()
print(doubledHost)
# gpuarray.dot computes the dot product on the device; .get() returns
# the scalar result to the host.
dotHost = gpuarray.dot(vectorDevice, vectorDevice).get()
print(dotHost)
# Define computation with ElementwiseKernel.
# Prepare kernel: ElementwiseKernel generates and compiles a CUDA kernel
# that applies the given C expression to each element index i.
kernel = ElementwiseKernel(
    'float *vector',
    'vector[i] *= 2')
# Send vector to device
vectorHost = np.arange(256, dtype=np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)
# Run the kernel in place on the device, then copy the result back
kernel(vectorDevice)
vectorHost = vectorDevice.get()
print(vectorHost)
# Define computation with SourceModule.
# Prepare kernel: compile raw CUDA C. Each thread doubles the one element
# indexed by its thread id; a single 256-thread block covers the vector.
kernel = SourceModule("""
__global__ void doublify(float *vector) {
  vector[threadIdx.x] *= 2;
}
""").get_function('doublify')
# Compute in place on the host array: cuda.InOut copies it to the device
# before the launch and back afterwards.
vectorHost = np.arange(256, dtype=np.float32)
kernel(cuda.InOut(vectorHost), block=(vectorHost.size, 1, 1))
print(vectorHost)
# Prepare kernel: same doublify, but the multiplier is injected into the
# CUDA source via str.format — literal CUDA braces are escaped as {{ }}.
kernel = SourceModule("""
__global__ void doublify(float *vector) {{
  vector[threadIdx.x] *= {scalar};
}}
""".format(scalar=2)).get_function('doublify')
# Compute in place on the host array via cuda.InOut
vectorHost = np.arange(256, dtype=np.float32)
kernel(cuda.InOut(vectorHost), block=(vectorHost.size, 1, 1))
print(vectorHost)
# Prepare kernel: 2D thread block. Each thread flattens its (x, y) thread
# index into a row-major offset and doubles that element.
kernel = SourceModule("""
__global__ void doublify(float *vector) {
  int offset = threadIdx.x + threadIdx.y * blockDim.x;
  vector[offset] *= 2;
}
""").get_function('doublify')
# Compute on a 2x2 matrix; the block dimensions mirror the array shape so
# there is exactly one thread per element.
vectorHost = np.array([[1, 2], [3, 4]]).astype(np.float32)
kernel(cuda.InOut(vectorHost), block=(
    vectorHost.shape[0],
    vectorHost.shape[1],
    1,
))
print(vectorHost)
# Define operation with ReductionKernel.
# Prepare kernel: a GPU map-reduce computing a dot product — the map
# multiplies paired elements, the reduce sums the products.
kernel = ReductionKernel(
    np.float32,                      # Precision for input and output
    neutral='0',                     # Starting value for reduction
    map_expr='x[i] * y[i]',          # C code defining map()
    reduce_expr='a + b',             # C code defining reduce()
    arguments='float *x, float *y')  # Function arguments
# Send vector to device
vectorHost = np.arange(256, dtype=np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)
# Get result from device: .get() copies the reduced scalar to the host
resultHost = kernel(vectorDevice, vectorDevice).get()
print(resultHost)
# Send vector to device
vectorHost = np.arange(256, dtype=np.float32)
vectorDevice = gpuarray.to_gpu(vectorHost)
# Start timer: CUDA events are recorded into the device's command stream,
# so they measure GPU work rather than Python execution time.
start = cuda.Event()
end = cuda.Event()
start.record()
# Get result from device
vectorHost = (2 * vectorDevice).get()
# End timer; synchronize blocks until the end event has actually occurred
end.record()
end.synchronize()
# time_till returns milliseconds, hence the 1e-3 conversion to seconds
print('{:,} seconds'.format(start.time_till(end) * 1e-3))