# CrossCompute
# Input tables and output folder used by the rest of this notebook:
# the two CSV paths are read with pd.read_csv further down, and the
# pie-chart image is written under target_folder.
# NOTE(review): the "# CrossCompute" marker above suggests these are tool
# arguments parsed by CrossCompute — confirm before renaming or moving them.
child_labor_table_path = 'child_labor.csv'
gdp_table_path = 'gdp.csv'
target_folder = '/tmp'
import pip
# Install the extra packages this notebook imports below.
# pip has no supported in-process API (pip.main was removed in pip 10), so
# pip is run as a subprocess of the interpreter that executes this notebook.
# subprocess.call is used so a failed install stays best-effort, as before:
# the exit status is ignored and any real problem surfaces at import time.
import subprocess
import sys
subprocess.call([sys.executable, '-m', 'pip', 'install', 'statsmodels', 'pydotplus'])
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pydotplus
from IPython.display import Image
import numpy as np
%matplotlib inline
# Make every figure in the notebook 20x10 inches by default.
plt.rcParams.update({'figure.figsize': (20, 10)})
# Load the goods table into `df` and the GDP table into `gdp`.
df, gdp = (pd.read_csv(table_path) for table_path in (child_labor_table_path, gdp_table_path))
# Notebook previews: first rows of each table and the number of distinct countries.
df.head()
len(set(df["Country"]))
gdp.head()
Some definitions in this data set:
Child Labor refers to child slavery.
Forced Labor refers to adult slavery.
First, we will do some simple exploratory analysis.
Let's see how many instances of Child Labor there are as a proportion of the countries surveyed.
# Build one row per country with the number of goods flagged as produced
# with child labor (rows whose "Child Labor" column equals 'X').
# The rows are collected in a list and the frame is created once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
countries = list(set(df['Country']))
child_labor_rows = []
for country in countries:
    tmp_df = df[df['Country'] == country]
    num_child_produced = len(tmp_df[tmp_df["Child Labor"] == 'X'])
    child_labor_rows.append({
        "Country": country,
        "num_child_produced": num_child_produced
    })
# Passing `columns` keeps both columns present even if `df` has no rows.
child_labor = pd.DataFrame(child_labor_rows, columns=["Country", "num_child_produced"])
# Pie chart: countries with at least one good produced with child labor
# versus countries with none.
labels = 'child labor', 'not child labor'
countries_with_child_labor = len(child_labor[child_labor['num_child_produced'] > 0])
countries_without_child_labor = len(child_labor[child_labor['num_child_produced'] == 0])
sizes = [countries_with_child_labor, countries_without_child_labor]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
# Save through the figure object BEFORE showing it: once plt.show() has
# displayed the figure, plt.savefig() would write a new, empty canvas.
target_path = target_folder + '/child-labor-percent.png'
fig1.savefig(target_path)
plt.show()
# Report where the image was written (the output line format is unchanged).
print('child_labor_percent_image_path = %s' % target_path)
So, as we can see, 98.7% of all countries surveyed have some form of child labor.
# Pie chart: countries with more than one child-labor good versus
# countries with one or none.
child_counts = child_labor['num_child_produced']
more_than_one_industry = int((child_counts > 1).sum())
one_or_fewer_industries = int((child_counts <= 1).sum())
labels = ('more than one industry', 'one or fewer industries')
sizes = [more_than_one_industry, one_or_fewer_industries]
fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # keep the pie circular
plt.show()
And 71.1% have more than one good produced with child labor (the chart labels these as "industries").
# Build one row per country with the number of goods flagged as produced
# with forced labor (rows whose "Forced Labor" column equals 'X').
# The rows are collected in a list and the frame is created once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
countries = list(set(df['Country']))
forced_labor_rows = []
for country in countries:
    tmp_df = df[df['Country'] == country]
    num_forced_produced = len(tmp_df[tmp_df["Forced Labor"] == 'X'])
    forced_labor_rows.append({
        "Country": country,
        "num_forced_produced": num_forced_produced
    })
# Passing `columns` keeps both columns present even if `df` has no rows.
forced_labor = pd.DataFrame(forced_labor_rows, columns=["Country", "num_forced_produced"])
# Pie chart: countries with at least one good produced with forced labor
# versus countries with none.
forced_counts = forced_labor['num_forced_produced']
countries_with_forced_labor = int((forced_counts > 0).sum())
countries_without_forced_labor = int((forced_counts == 0).sum())
labels = ('forced labor', 'not forced labor')
sizes = [countries_with_forced_labor, countries_without_forced_labor]
fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # keep the pie circular
plt.show()
So, as we can see, 48.7% of all countries surveyed have some form of forced labor.
# Pie chart: countries with more than one forced-labor good versus
# countries with one or none.
forced_counts = forced_labor['num_forced_produced']
more_than_one_industry = int((forced_counts > 1).sum())
one_or_fewer_industries = int((forced_counts <= 1).sum())
labels = ('more than one good', 'one or fewer goods')
sizes = [more_than_one_industry, one_or_fewer_industries]
fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # keep the pie circular
plt.show()
And 22.4% have more than one good produced with forced labor.
# Hand-maintained lookup lists that assign each value of the "Good" column
# to an industry category (consumed by categorize_industry).
# Entries must match the data exactly, so spellings that differ only by case
# or a trailing space ('Oil (Palm)' / 'Oil (palm)', 'Rubber ', 'Garments ')
# are listed on purpose.
gems = ['Diamonds', 'Emeralds', 'Gems', 'Jade', 'Rubies', 'Sapphires', 'Tanzanite (gems)']
# The first two entries used to be written as 'Zinc, ''Wolframite (tungsten ore)',
# which Python concatenates into the single bogus string
# 'Zinc, Wolframite (tungsten ore)'; they are now two separate entries
# (and the duplicates that followed later in the list were dropped).
minerals = ['Zinc', 'Wolframite (tungsten ore)', 'Trona (mineral)', 'Silver',
            'Iron', 'Heterogenite (cobalt ore)', 'Gypsum (mineral)', 'Granite (crushed)',
            'Granite', 'Gold', 'Coltan (tantalum ore)', 'Copper', 'Cassiterite (tin ore)',
            'Fluorspar (mineral)', 'Tin', 'Gravel (crushed stones)']
food_related = ['Alcoholic Beverages', 'Baked Goods', 'Bananas',
                'Beans (green beans)', 'Beans (green, soy, yellow)',
                'Beef', 'Blueberries', 'Brazil Nuts/Chestnuts',
                'Broccoli', 'Cashews', 'Chile Peppers', 'Citrus Fruits',
                'Cloves', 'Coca (stimulant plant)', 'Cocoa',
                'Coconuts', 'Coffee', 'Corn', 'Cucumbers', 'Cumin',
                'Dried Fish', 'Eggplants', 'Fish', 'Garlic', 'Grapes',
                'Goats', 'Hazelnuts', 'Hogs', 'Lobsters', 'Meat',
                'Melons', 'Miraa (stimulant plant)', 'Olives',
                'Onions', 'Peanuts', 'Pepper', 'Physic Nuts/Castor Beans',
                'Potatoes', 'Poultry', 'Pulses (legumes)', 'Rice', 'Salt',
                'Sesame', 'Shellfish', 'Shrimp', 'Strawberries', 'Sugar Beets',
                'Sugarcane', 'Tea', 'Tomatoes', 'Vanilla', 'Wheat', 'Yerba Mate (stimulant plant)',
                'Nile Perch (fish)', 'Pineapples', 'Tilapia (fish)', 'Cattle', 'Oil (Palm)',
                'Oil (palm)', 'Sisal', 'Manioc/Cassava']
decorations = ['Artificial Flowers', 'Flowers', 'Flowers (poppies)', 'Sunflowers']
construction = ['Bricks', 'Bricks (clay)', 'Cement', 'Ceramics', 'Glass', 'Nails',
                'Palm Thatch', 'Sand', 'Stones', 'Stones (limestone)', 'Timber', 'Rubber', 'Rubber ',
                'Bamboo']
cigarettes = ['Bidis (hand-rolled cigarettes)', 'Tobacco']
# 'Ceramics' and 'Glass' also appear under construction, and 'Matches' under
# celebratory; which label such a good ends up with is decided by the order
# of the checks in categorize_industry.
home_durables = ['Brassware', 'Ceramics', 'Furniture', 'Furniture (steel)', 'Glass', 'Glass Bangles',
                 'Locks', 'Matches', 'Soap', 'Stones (pumice)', 'Teak']
garments_and_fabrics = ['Carpets', 'Cotton', 'Embellished Textiles', 'Footwear', 'Footwear (sandals)',
                        'Garments', 'Garments ', 'Leather', 'Leather Goods/Accessories', 'Silk Cocoons',
                        'Silk Fabric', 'Silk Thread', 'Textiles', 'Textiles (hand-woven)', 'Textiles (jute)',
                        'Thread/Yarn', 'Cottonseed (hybrid)', 'Fashion Accessories']
celebratory = ['Christmas Decorations', 'Fireworks', 'Incense (agarbatti)', 'Matches', 'Pyrotechnics',
               'Soccer Balls', 'Toys']
energy = ['Charcoal', 'Coal']
medical = ['Surgical Instruments']
sexual_exploitation = ['Pornography']
technology = ['Electronics']
def categorize_industry(x):
    """Label a row with the industry category of its "Good".

    ``x`` is a row-like mapping with a "Good" entry (a DataFrame row when
    called through ``df.apply(..., axis=1)``).  The row is modified in
    place by setting ``x["industry"]`` and is also returned.  When the
    good is in none of the category lists, "industry" is left unset.
    """
    # The pairs are checked in this order and every match overwrites the
    # previous one, so a good listed under several categories keeps the
    # label of the LAST matching category.
    goods_and_labels = (
        (food_related, 'food'),
        (gems, 'gems'),
        (minerals, 'minerals'),
        (decorations, "decorations"),
        (construction, "construction"),
        (cigarettes, "cigarettes"),
        (home_durables, "home_durables"),
        (garments_and_fabrics, "garments"),
        (celebratory, "celebratory"),
        (energy, "energy"),
        (medical, "medical"),
        (sexual_exploitation, "sexual exploitation"),
        (technology, "technology"),
    )
    good = x["Good"]
    for goods, label in goods_and_labels:
        if good in goods:
            x["industry"] = label
    return x
# Add the "industry" column by categorizing every row's good.
df = df.apply(categorize_industry, axis=1)
# Print the goods that matched no category, so the lookup lists can be extended.
for good in df.loc[df["industry"].isnull(), "Good"]:
    print(good)
# Bar chart of the number of rows per industry.
industry_counts = df["industry"].value_counts()
# Take the categories from value_counts itself: it skips missing values,
# whereas set(df["industry"]) would contain NaN for uncategorized goods and
# make the industry_counts[...] lookup below fail with a KeyError.
x_vals = list(industry_counts.index)
y_vals = [industry_counts[elem] for elem in x_vals]
pos = list(range(len(x_vals)))
plt.bar(pos, y_vals, align="center")
plt.xticks(pos, x_vals)
plt.show()
# Attach to every row the number of rows (goods) listed for its country.
countries = set(df["Country"])
# Rows per country, computed in a single pass over the table.
goods_per_country = df["Country"].value_counts()
# (country, number of goods) pairs, kept for later inspection.
num_industries = [(country, int(goods_per_country.get(country, 0))) for country in countries]
# Map each row's country to its count.  This replaces a nested Python loop
# built on DataFrame.ix, which was removed in pandas 1.0.  Rows without a
# country stay NaN and the column stays float, as before.
df["num_goods"] = df["Country"].map(goods_per_country).astype(float)
df.head()
# Build two row-aligned tables restricted to the countries present in both
# data sets: new_gdp (GDP rows) and new_df (the first goods row of each
# country, which carries the per-country num_goods value).
gdp["Country"] = gdp["countries"]
# Vectorised membership filters replace the row-by-row DataFrame.append
# loops (DataFrame.append was removed in pandas 2.0).
# drop_duplicates keeps exactly one row per country on both sides so the
# two tables line up one-to-one after sorting.
new_gdp = gdp[gdp["Country"].isin(df["Country"])].drop_duplicates("Country")
new_df = df[df["Country"].isin(new_gdp["Country"])].drop_duplicates("Country")
len(new_gdp), len(new_df)
# Same sort key and a fresh 0..n-1 index on both tables, so row i of
# new_gdp and row i of new_df describe the same country.
new_gdp = new_gdp.sort_values("Country").reset_index(drop=True)
new_df = new_df.sort_values("Country").reset_index(drop=True)
from scipy import stats
# Spearman rank correlation between each year's GDP column and the number
# of goods per country.
for year in ("2016", "2017"):
    print(stats.spearmanr(new_gdp[year], new_df["num_goods"]))
# Ordinary least squares of 2016 GDP on num_goods alone.  No constant column
# is added to the regressor, so the model is fitted without an intercept.
industry_model = sm.OLS(new_gdp["2016"], new_df["num_goods"])
result = industry_model.fit()
result.summary()
from sklearn import model_selection
# Predict 2016 GDP from num_goods.  30% of the countries are held out for
# testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(new_df["num_goods"],
                                                                    new_gdp["2016"],
                                                                    test_size=0.3, random_state=42)
# The estimators need a 2-D feature matrix with one column.  Series.reshape
# no longer exists in pandas, so reshape the underlying numpy array instead.
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
from sklearn import tree
from sklearn import metrics
import math
# Model 1: a single regression tree.
dtree = tree.DecisionTreeRegressor()
dtree_model = dtree.fit(X_train, y_train)
predictions = dtree_model.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
from sklearn import ensemble
# Model 2: gradient boosting.
original_params = {'n_estimators': 1000, 'max_leaf_nodes': 17, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}
setting = {'learning_rate': 0.1, 'subsample': 1.0}
params = dict(original_params)
params.update(setting)
gbr = ensemble.GradientBoostingRegressor(**params)
gbr_tree = tree.DecisionTreeRegressor()
gbr.fit(X_train, y_train)
predictions = gbr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
# Fit a single tree to the booster's own predictions on the training data.
output = gbr.predict(X_train)
gbr_tree_model = gbr_tree.fit(X_train, output)
from sklearn import svm
# Model 3: support vector regression with default settings.
svr = svm.SVR()
svr_tree = tree.DecisionTreeRegressor()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
# Fit a single tree to the SVR's predictions on the training data and draw it.
output = svr.predict(X_train)
svr_tree_model = svr_tree.fit(X_train, output)
# class_names is omitted: it only applies to classification trees, and a
# bare string is not an accepted value for it.
dot_data = tree.export_graphviz(svr_tree_model, out_file=None,
                                feature_names=["num_goods"],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
picture = Image(graph.create_png())
picture
# Same analysis as for 2016, now with the 2017 GDP column as the target.
# Ordinary least squares of 2017 GDP on num_goods alone (no constant column
# is added, so there is no intercept term).
industry_model = sm.OLS(new_gdp["2017"], new_df["num_goods"])
result = industry_model.fit()
result.summary()
from sklearn import model_selection
# 30% of the countries are held out for testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(new_df["num_goods"],
                                                                    new_gdp["2017"],
                                                                    test_size=0.3, random_state=42)
# The estimators need a 2-D feature matrix with one column.  Series.reshape
# no longer exists in pandas, so reshape the underlying numpy array instead.
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
from sklearn import tree
from sklearn import metrics
import math
# Model 1: a single regression tree.
dtree = tree.DecisionTreeRegressor()
dtree_model = dtree.fit(X_train, y_train)
predictions = dtree_model.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
from sklearn import ensemble
# Model 2: gradient boosting.
original_params = {'n_estimators': 1000, 'max_leaf_nodes': 17, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}
setting = {'learning_rate': 0.1, 'subsample': 1.0}
params = dict(original_params)
params.update(setting)
gbr = ensemble.GradientBoostingRegressor(**params)
gbr_tree = tree.DecisionTreeRegressor()
gbr.fit(X_train, y_train)
predictions = gbr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
# Fit a single tree to the booster's own predictions on the training data.
output = gbr.predict(X_train)
gbr_tree_model = gbr_tree.fit(X_train, output)
from sklearn import svm
# Model 3: support vector regression with default settings.
svr = svm.SVR()
svr_tree = tree.DecisionTreeRegressor()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
# Fit a single tree to the SVR's predictions on the training data and draw it.
output = svr.predict(X_train)
svr_tree_model = svr_tree.fit(X_train, output)
# class_names is omitted: it only applies to classification trees, and a
# bare string is not an accepted value for it.
dot_data = tree.export_graphviz(svr_tree_model, out_file=None,
                                feature_names=["num_goods"],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
picture = Image(graph.create_png())
picture
# Enrich every goods row with per-country counts and industry indicators.
df.head()
# child_labor and forced_labor hold one row per country, so these merges
# simply add num_child_produced / num_forced_produced to each goods row.
df = pd.merge(df, child_labor, on="Country")
df.head()
df = pd.merge(df, forced_labor, on="Country")
df.head()
# One indicator column per industry, one row per goods row.  The dtype is
# pinned so the columns stay numeric regardless of the pandas default.
industries = pd.get_dummies(df["industry"], dtype="uint8")
# Attach the indicators column-wise (same index as df).  Merging them on
# "Country" instead would be a many-to-many join: `industries` has one row
# per goods row, so a country with n goods would end up with n*n rows.
# The first row of each country is the same either way.
df = pd.concat([df, industries], axis=1)
# Kept so that `industries` still carries the country of each row.
industries["Country"] = df["Country"]
df.head()
# Rebuild the two row-aligned tables from the enriched df: new_gdp (GDP rows)
# and new_df (the first row of each country), restricted to the countries
# present in both data sets.
gdp["Country"] = gdp["countries"]
# Vectorised membership filters replace the row-by-row DataFrame.append
# loops (DataFrame.append was removed in pandas 2.0).
# drop_duplicates keeps exactly one row per country on both sides so the
# two tables line up one-to-one after sorting.
new_gdp = gdp[gdp["Country"].isin(df["Country"])].drop_duplicates("Country")
new_df = df[df["Country"].isin(new_gdp["Country"])].drop_duplicates("Country")
len(new_gdp), len(new_df)
# Same sort key and a fresh 0..n-1 index on both tables, so row i of
# new_gdp and row i of new_df describe the same country.
new_gdp = new_gdp.sort_values("Country").reset_index(drop=True)
new_df = new_df.sort_values("Country").reset_index(drop=True)
new_df.head()
# Remove the identifier and label columns so that only the columns used as
# model features remain.
new_df = new_df.drop(["Good", "Country", "Child Labor", "Forced Labor", "industry"], axis=1)
new_df.head()
# Ordinary least squares of 2016 GDP on every column left in new_df.
multivariate_model = sm.OLS(new_gdp["2016"], new_df)
result = multivariate_model.fit()
result.summary()
# Refit on a hand-picked subset of the columns.
selected_columns = [
    "construction",
    "decorations",
    "food",
    "home_durables",
    "medical",
    "num_child_produced",
    "num_forced_produced",
    "sexual exploitation",
]
X = new_df[selected_columns]
multivariate_model = sm.OLS(new_gdp["2016"], X)
result = multivariate_model.fit()
result.summary()
from sklearn import model_selection
# Predict 2016 GDP from all columns of new_df.  30% of the countries are
# held out for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(new_df,
                                                                    new_gdp["2016"],
                                                                    test_size=0.3, random_state=42)
from sklearn import tree
from sklearn import metrics
import math
# Model 1: a single regression tree.
dtree = tree.DecisionTreeRegressor()
dtree_model = dtree.fit(X_train, y_train)
predictions = dtree_model.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
from sklearn import ensemble
# Model 2: gradient boosting.
original_params = {'n_estimators': 1000, 'max_leaf_nodes': 17, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}
setting = {'learning_rate': 0.1, 'subsample': 1.0}
params = dict(original_params)
params.update(setting)
gbr = ensemble.GradientBoostingRegressor(**params)
gbr_tree = tree.DecisionTreeRegressor()
gbr.fit(X_train, y_train)
predictions = gbr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
# Fit a single tree to the booster's own predictions on the training data.
output = gbr.predict(X_train)
gbr_tree_model = gbr_tree.fit(X_train, output)
from sklearn import svm
# Model 3: support vector regression with default settings.
svr = svm.SVR()
svr_tree = tree.DecisionTreeRegressor()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
# Fit a single tree to the SVR's predictions on the training data and draw it.
output = svr.predict(X_train)
svr_tree_model = svr_tree.fit(X_train, output)
# feature_names is passed as a plain list of column names.  class_names is
# omitted: it only applies to classification trees, and a bare string is not
# an accepted value for it.
dot_data = tree.export_graphviz(svr_tree_model, out_file=None,
                                feature_names=list(new_df.columns),
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
picture = Image(graph.create_png())
picture
new_df.columns
# Print each feature column next to its importance in the tree that was
# fitted to the SVR's predictions.
for column, importance in zip(new_df.columns, svr_tree_model.feature_importances_):
    print(column, importance)
# Repeat the SVR experiment with only two features.
X = new_df[["num_forced_produced", "num_goods"]]
# 30% of the countries are held out for testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,
                                                                    new_gdp["2016"],
                                                                    test_size=0.3, random_state=42)
svr = svm.SVR()
svr_tree = tree.DecisionTreeRegressor()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))
# Fit a single tree to the SVR's predictions on the training data and draw it.
output = svr.predict(X_train)
svr_tree_model = svr_tree.fit(X_train, output)
# feature_names is passed as a plain list of column names.  class_names is
# omitted: it only applies to classification trees, and a bare string is not
# an accepted value for it.
dot_data = tree.export_graphviz(svr_tree_model, out_file=None,
                                feature_names=list(X.columns),
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
picture = Image(graph.create_png())
picture
Economic Features:
For social factors I was going to look at:
For technological factors I was going to look at:
For political factors I was going to look at:
df.head()
# Order the rows by the number of goods of their country (ascending), so
# the rows with the largest num_goods end up last.
df = df.sort_values("num_goods")
# Countries appearing in the last 1000 rows.  Note the slice counts rows,
# not countries.
top_rows = df.iloc[-1000:]
set(top_rows["Country"])
for country in set(top_rows["Country"]):
    print(country, df.loc[df["Country"] == country, "num_goods"].iloc[0])
# Distinct industries and goods recorded for India.
india_rows = df[df["Country"] == 'India']
set(india_rows["industry"])
set(india_rows["Good"])