Chunk large CSV files.
from pandas import concat, read_csv

def load(file_path, **kw):
    # Keep the original index only when the caller specifies an index column
    ignore_index = 'index_col' not in kw
    # Read the file in 10,000-row chunks instead of loading it all at once
    chunk_iterator = read_csv(file_path, iterator=True, chunksize=10000, **kw)
    return concat(chunk_iterator, ignore_index=ignore_index)
Limit columns in DataFrames.
from pandas import read_csv

chocolate = read_csv('datasets/UN-Chocolate.csv')
chocolate                     # Show every column
chocolate[['Year', 'Flow']]   # Show only the Year and Flow columns
Consider h5py or numpy.memmap for arrays too large for memory (a minimal memmap sketch follows the caching example below), and lru_cache or dogpile.cache for computationally intensive operations.
from dogpile.cache import make_region
# Configure an in-memory cache region
region = make_region().configure('dogpile.cache.memory')
cache_on_arguments = region.cache_on_arguments

@cache_on_arguments()
def f(x):
    print('Wheee!')
    return x

print(f(1))
print(f(1))  # Cached; 'Wheee!' is not printed again
print(f(2))
print(f(2))  # Cached
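For the numpy.memmap suggestion above, here is a minimal sketch; the file name 'large.dat' and the array shape are arbitrary examples, not from the original.
import numpy as np

# Create a disk-backed array; only the pages that are touched load into memory
values = np.memmap('large.dat', dtype='float32', mode='w+', shape=(1000000, 10))
values[:100] = np.random.rand(100, 10)
values.flush()  # Write pending changes to disk

# Reopen the same file read-only in a later session
values = np.memmap('large.dat', dtype='float32', mode='r', shape=(1000000, 10))
print(values[:3])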
Select features with cross-validation.
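One way to do this is scikit-learn's RFECV; the estimator and the iris dataset here are illustrative assumptions, a minimal sketch rather than a prescribed recipe.
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

iris = load_iris()
# Recursively drop the weakest feature, scoring each subset with 5-fold cross-validation
selector = RFECV(LogisticRegression(max_iter=1000), cv=5)
selector.fit(iris.data, iris.target)
print(selector.support_)  # Boolean mask of the selected features
print(selector.ranking_)  # Rank 1 marks the features kept by cross-validation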
Select models with cross-validation.
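A minimal sketch that compares two arbitrary candidate classifiers by their cross-validated scores on iris; keep whichever scores higher.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
# Prefer the model with the best mean 5-fold cross-validation score
for model in [KNeighborsClassifier(), DecisionTreeClassifier()]:
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print(model.__class__.__name__, scores.mean())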
Scale samples.
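A minimal sketch with scikit-learn's StandardScaler on a toy array; the data here is made up for illustration.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1., 10.], [2., 0.], [3., -10.]])
# Center each feature at zero mean and scale it to unit variance
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled.mean(axis=0))  # Approximately 0 for every column
print(X_scaled.std(axis=0))   # Approximately 1 for every column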
Decorrelate samples.
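One way to decorrelate is PCA with whitening, sketched here on iris; PCA is an assumption, not the only possible transformation.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

iris = load_iris()
# Whitened principal components are uncorrelated and have unit variance
X_decorrelated = PCA(whiten=True).fit_transform(iris.data)
print(np.round(np.cov(X_decorrelated.T), 2))  # Off-diagonal covariances are ~0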
Cross-validate with transformations by pipelining.
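A minimal sketch with scikit-learn's make_pipeline: the scaler is refit on each training fold, so the transformation never sees the test fold. The scaler and classifier choices are illustrative.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

iris = load_iris()
pipeline = make_pipeline(StandardScaler(), SVC())
# Each fold scales its own training data before fitting the classifier
print(cross_val_score(pipeline, iris.data, iris.target, cv=5).mean())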
Interpolate missing labels.
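Reading this as semi-supervised label propagation (one interpretation, not necessarily the intended one), here is a sketch with scikit-learn's LabelSpreading, which treats samples labeled -1 as missing.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.semi_supervised import LabelSpreading

iris = load_iris()
labels = np.copy(iris.target)
# Hide half the labels; -1 marks a sample as unlabeled
labels[np.random.RandomState(0).rand(len(labels)) < 0.5] = -1
model = LabelSpreading().fit(iris.data, labels)
print(model.transduction_[:10])  # Labels inferred for every sample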