In [36]:

# CrossCompute
child_labor_table_path = 'child_labor.csv'
gdp_table_path = 'gdp.csv'
target_folder = '/tmp'

In [37]:

import pip
pip.main(['install', 'statsmodels', 'pydotplus'])

Requirement already satisfied: statsmodels in /home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages
Requirement already satisfied: pydotplus in /home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages
Requirement already satisfied: pandas in /usr/lib64/python3.6/site-packages (from statsmodels)
Requirement already satisfied: patsy in /home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages (from statsmodels)
Requirement already satisfied: scipy in /usr/lib64/python3.6/site-packages (from statsmodels)
Requirement already satisfied: pyparsing>=2.0.1 in /usr/lib/python3.6/site-packages (from pydotplus)
Requirement already satisfied: python-dateutil>=2 in /home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages (from pandas->statsmodels)
Requirement already satisfied: pytz>=2011k in /usr/lib/python3.6/site-packages (from pandas->statsmodels)
Requirement already satisfied: numpy>=1.7.0 in /usr/lib64/python3.6/site-packages (from pandas->statsmodels)
Requirement already satisfied: six in /home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages (from patsy->statsmodels)

Out[37]:

In [38]:

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pydotplus 
from IPython.display import Image  
import numpy as np
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 10)

In [39]:

df = pd.read_csv(child_labor_table_path)

In [40]:

gdp = pd.read_csv(gdp_table_path)

In [9]:

df.head()

Out[9]:

	Country	Good	Child Labor	Forced Labor
0	Afghanistan	Bricks	X	X
1	Afghanistan	Carpets	X	NaN
2	Afghanistan	Coal	X	NaN
3	Afghanistan	Flowers (poppies)	X	NaN
4	Angola	Diamonds	X	X

In [10]:

len(set(df["Country"]))

Out[10]:

In [11]:

gdp.head()

Out[11]:

	Unnamed: 0	2016	2017	2018	2019	2020	countries
0	0	18.395	19.290	20.604	22.297	24.222	Afghanistan
1	1	12.144	12.876	13.764	14.813	15.997	Albania
2	2	168.318	178.431	184.189	189.297	195.367	Algeria
3	3	91.939	102.315	108.588	114.449	120.822	Angola
4	4	1.303	1.358	1.429	1.505	1.585	Antigua and Barbuda

Understanding our data¶

Some definitions in this data set:

Child Labor refers to child slavery.

Forced Labor refers to adult slavery. First, we will engage in some simple explanatory analysis.

Let's see how many instances of Child Labor there are as a proportion of the countries surveyed.

Child Labor¶

In [12]:

child_labor = pd.DataFrame()
countries = list(set(df['Country']))
for country in countries:
    tmp_df = df[df['Country'] == country]
    num_child_produced = len(tmp_df[tmp_df["Child Labor"] == 'X'])
    child_labor = child_labor.append({
        "Country": country,
        "num_child_produced": num_child_produced
    }, ignore_index=True)

In [13]:

labels = 'child labor', 'not child labor'
countries_with_child_labor = len(child_labor[child_labor['num_child_produced'] > 0])
countries_without_child_labor = len(child_labor[child_labor['num_child_produced'] == 0])
sizes = [countries_with_child_labor, countries_without_child_labor]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

target_path = target_folder + '/child-labor-percent.png'
plt.savefig(target_path)
print('child_labor_percent_image_path = %s' % target_path)

So, as we can see, 98.7% of all countries surveyed have some form of child labor.

In [14]:

labels = 'more than one industry', 'one or fewer industries'
more_than_one_industry = len(child_labor[child_labor['num_child_produced'] > 1])
one_or_fewer_industries = len(child_labor[child_labor['num_child_produced'] <= 1])
sizes = [more_than_one_industry, one_or_fewer_industries]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

And 71.1% have more than one industry where child labor is used.

Forced Labor¶

In [15]:

forced_labor = pd.DataFrame()
countries = list(set(df['Country']))
for country in countries:
    tmp_df = df[df['Country'] == country]
    num_forced_produced = len(tmp_df[tmp_df["Forced Labor"] == 'X'])
    forced_labor = forced_labor.append({
        "Country": country,
        "num_forced_produced": num_forced_produced
    }, ignore_index=True)

In [16]:

labels = 'forced labor', 'not forced labor'
countries_with_forced_labor = len(forced_labor[forced_labor['num_forced_produced'] > 0])
countries_without_forced_labor = len(forced_labor[forced_labor['num_forced_produced'] == 0])
sizes = [countries_with_forced_labor, countries_without_forced_labor]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

So, as we can see, 48.7% of all countries surveyed have some form of forced labor.

In [17]:

labels = 'more than one good', 'one or fewer goods'
more_than_one_industry = len(forced_labor[forced_labor['num_forced_produced'] > 1])
one_or_fewer_industries = len(forced_labor[forced_labor['num_forced_produced'] <= 1])
sizes = [more_than_one_industry, one_or_fewer_industries]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

And 22.4% have more than one industry where forced labor is used.

In [18]:

gems = ['Diamonds', 'Emeralds', 'Gems', 'Jade', 'Rubies', 'Sapphires', 'Tanzanite (gems)']
minerals = ['Zinc, ''Wolframite (tungsten ore)', 'Trona (mineral)', 'Silver',
           'Iron', 'Heterogenite (cobalt ore)', 'Gypsum (mineral)', 'Granite (crushed)',
           'Granite', 'Gold', 'Coltan (tantalum ore)', 'Copper', 'Cassiterite (tin ore)', 
            'Fluorspar (mineral)', 'Tin', 'Zinc', 'Wolframite (tungsten ore)', 'Gravel (crushed stones)']
food_related = ['Alcoholic Beverages', 'Baked Goods', 'Bananas',
               'Beans (green beans)', 'Beans (green, soy, yellow)',
               'Beef', 'Blueberries', 'Brazil Nuts/Chestnuts',
               'Broccoli', 'Cashews', 'Chile Peppers', 'Citrus Fruits',
               'Cloves', 'Coca (stimulant plant)', 'Cocoa', 
               'Coconuts', 'Coffee', 'Corn', 'Cucumbers', 'Cumin',
               'Dried Fish', 'Eggplants', 'Fish', 'Garlic', 'Grapes',
               'Goats', 'Hazelnuts', 'Hogs', 'Lobsters', 'Meat',
               'Melons', 'Miraa (stimulant plant)', 'Olives',
               'Onions', 'Peanuts', 'Pepper', 'Physic Nuts/Castor Beans',
               'Potatoes', 'Poultry', 'Pulses (legumes)', 'Rice', 'Salt',
               'Sesame', 'Shellfish', 'Shrimp', 'Strawberries', 'Sugar Beets',
               'Sugarcane', 'Tea', 'Tomatoes', 'Vanilla', 'Wheat', 'Yerba Mate (stimulant plant)',
               'Nile Perch (fish)', 'Pineapples', 'Tilapia (fish)', 'Cattle', 'Oil (Palm)',
               'Oil (palm)', 'Sisal', 'Manioc/Cassava']
decorations = ['Artificial Flowers', 'Flowers', 'Flowers (poppies)', 'Sunflowers']
construction = ['Bricks', 'Bricks (clay)', 'Cement', 'Ceramics', 'Glass', 'Nails',
               'Palm Thatch', 'Sand', 'Stones', 'Stones (limestone)', 'Timber', 'Rubber', 'Rubber ',
               'Bamboo']
cigarettes = ['Bidis (hand-rolled cigarettes)', 'Tobacco']
home_durables = ['Brassware', 'Ceramics', 'Furniture', 'Furniture (steel)', 'Glass', 'Glass Bangles',
                'Locks', 'Matches', 'Soap', 'Stones (pumice)', 'Teak']
garments_and_fabrics = ['Carpets', 'Cotton', 'Embellished Textiles', 'Footwear', 'Footwear (sandals)',
                       'Garments', 'Garments ', 'Leather','Leather Goods/Accessories', 'Silk Cocoons',
                       'Silk Fabric', 'Silk Thread', 'Textiles', 'Textiles (hand-woven)', 'Textiles (jute)',
                       'Thread/Yarn', 'Cottonseed (hybrid)', 'Fashion Accessories']
celebratory = ['Christmas Decorations', 'Fireworks', 'Incense (agarbatti)', 'Matches', 'Pyrotechnics',
              'Soccer Balls', 'Toys']
energy = ['Charcoal', 'Coal']
medical = ['Surgical Instruments']
sexual_exploitation = ['Pornography']
technology = ['Electronics']

def categorize_industry(x):    
    if x["Good"] in food_related:
        x["industry"] = 'food'
    if x["Good"] in gems:
        x["industry"] = 'gems'
    if x["Good"] in minerals:
        x["industry"] = 'minerals'
    if x["Good"] in decorations:
        x["industry"] = "decorations"
    if x["Good"] in construction:
        x["industry"] = "construction"
    if x["Good"] in cigarettes:
        x["industry"] = "cigarettes"
    if x["Good"] in home_durables:
        x["industry"] = "home_durables"
    if x["Good"] in garments_and_fabrics:
        x["industry"] = "garments"
    if x["Good"] in celebratory:
        x["industry"] = "celebratory"
    if x["Good"] in energy:
        x["industry"] = "energy"
    if x["Good"] in medical:
        x["industry"] = "medical"
    if x["Good"] in sexual_exploitation:
        x["industry"] = "sexual exploitation"
    if x["Good"] in technology:
        x["industry"] = "technology"
    return x

df = df.apply(categorize_industry, axis=1)

In [19]:

for index in range(len(df)):
    if pd.isnull(df.iloc[index]["industry"]):
        print(df.iloc[index]["Good"])

In [20]:

industry_counts = df["industry"].value_counts()
x_vals = list(set(df["industry"]))
y_vals = [industry_counts[elem] for elem in x_vals]
pos = list(range(len(x_vals)))
plt.bar(pos, y_vals, align="center")
plt.xticks(pos, x_vals)
plt.show()

What is the effect of number of goods which employ forced labor on GDP?¶

In [21]:

countries = set(df["Country"])
num_industries = []
for country in countries:
    tmp = df[df["Country"] == country]
    num_industries.append((country, len(tmp)))
df["num_goods"] = np.nan
for index in df.index:
    for industry in num_industries:
        if df.ix[index]["Country"] == industry[0]:
            df.at[index, "num_goods"] = industry[1]

/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/ipykernel_launcher.py:9: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  if __name__ == '__main__':

In [22]:

df.head()

Out[22]:

	Country	Good	Child Labor	Forced Labor	industry	num_goods
0	Afghanistan	Bricks	X	X	construction	4.0
1	Afghanistan	Carpets	X	NaN	garments	4.0
2	Afghanistan	Coal	X	NaN	energy	4.0
3	Afghanistan	Flowers (poppies)	X	NaN	decorations	4.0
4	Angola	Diamonds	X	X	gems	1.0

In [23]:

new_gdp = pd.DataFrame()
new_df = pd.DataFrame()
gdp["Country"] = gdp["countries"]
for country in set(gdp["Country"]):
    if country in list(df["Country"]):
        new_gdp = new_gdp.append(gdp[gdp["Country"] == country])
for country in set(df["Country"]):
    if country in list(new_gdp["Country"]):
        tmp = df[df["Country"] == country]
        if not tmp.empty:
            new_df = new_df.append(tmp.iloc[0])
len(new_gdp), len(new_df)
new_gdp = new_gdp.sort_values("Country")
new_df = new_df.sort_values("Country")
new_gdp.index = list(range(len(new_gdp)))
new_df.index = list(range(len(new_df)))

In [24]:

from scipy import stats
print(stats.spearmanr(new_gdp["2016"], new_df["num_goods"]))
print(stats.spearmanr(new_gdp["2017"], new_df["num_goods"]))

SpearmanrResult(correlation=0.49778002485163997, pvalue=1.3498952292651301e-05)
SpearmanrResult(correlation=0.49765061609527217, pvalue=1.3579795834469717e-05)

Looking at 2016¶

In [25]:

industry_model = sm.OLS(new_gdp["2016"], new_df["num_goods"])
result = industry_model.fit()
result.summary()

Out[25]:

<caption>OLS Regression Results</caption>

Dep. Variable:	2016	R-squared:	0.165
Model:	OLS	Adj. R-squared:	0.153
Method:	Least Squares	F-statistic:	13.48
Date:	Thu, 14 Dec 2017	Prob (F-statistic):	0.000475
Time:	19:56:58	Log-Likelihood:	-593.50
No. Observations:	69	AIC:	1189.
Df Residuals:	68	BIC:	1191.
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
num_goods	87.3510	23.790	3.672	0.000	39.878	134.824

Omnibus:	140.809	Durbin-Watson:	2.043
Prob(Omnibus):	0.000	Jarque-Bera (JB):	9191.540
Skew:	7.233	Prob(JB):	0.00
Kurtosis:	57.661	Cond. No.	1.00

In [26]:

from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(new_df["num_goods"], 
                                                                    new_gdp["2016"], 
                                                                    test_size=0.3, random_state=42)
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)

/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/ipykernel_launcher.py:6: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/ipykernel_launcher.py:7: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  import sys

In [27]:

from sklearn import tree
from sklearn import metrics
import math
dtree = tree.DecisionTreeRegressor()
dtree_model = dtree.fit(X_train, y_train)
predictions = dtree_model.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

RMSE Results 1262.7925591605995

In [28]:

from sklearn import ensemble

original_params = {'n_estimators': 1000, 'max_leaf_nodes': 17, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}

setting = {'learning_rate': 0.1, 'subsample': 1.0}
params = dict(original_params)
params.update(setting)

gbr = ensemble.GradientBoostingRegressor(**params)
gbr_tree = tree.DecisionTreeRegressor()
gbr.fit(X_train, y_train)
predictions = gbr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

output = gbr.predict(X_train)
gbr_tree_model = gbr_tree.fit(X_train, output)

RMSE Results 1262.7925591605986

In [29]:

from sklearn import svm

svr = svm.SVR()
svr_tree = tree.DecisionTreeRegressor()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

output = svr.predict(X_train)
svr_tree_model = svr_tree.fit(X_train, output)

RMSE Results 95.54703493652225

In [30]:

dot_data = tree.export_graphviz(svr_tree_model, out_file=None, 
                         feature_names=["num_goods"],  
                         class_names="GDP",  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
picture = Image(graph.create_png())

picture

---------------------------------------------------------------------------
InvocationException                       Traceback (most recent call last)
<ipython-input-30-1339e9e46ff6> in <module>()
      5                          special_characters=True)  
      6 graph = pydotplus.graph_from_dot_data(dot_data)
----> 7 picture = Image(graph.create_png())
      8 
      9 picture

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pydotplus/graphviz.py in <lambda>(f, prog)
   1795             self.__setattr__(
   1796                 'create_' + frmt,
-> 1797                 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
   1798             )
   1799             f = self.__dict__['create_' + frmt]

~/.virtualenvs/crosscompute/lib/python3.6/site-packages/pydotplus/graphviz.py in create(self, prog, format)
   1958             if self.progs is None:
   1959                 raise InvocationException(
-> 1960                     'GraphViz\'s executables not found')
   1961 
   1962         if prog not in self.progs:

InvocationException: GraphViz's executables not found

Looking At 2017¶

In [ ]:

industry_model = sm.OLS(new_gdp["2017"], new_df["num_goods"])
result = industry_model.fit()
result.summary()

In [ ]:

from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(new_df["num_goods"], 
                                                                    new_gdp["2017"], 
                                                                    test_size=0.3, random_state=42)
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)

In [ ]:

from sklearn import tree
from sklearn import metrics
import math
dtree = tree.DecisionTreeRegressor()
dtree_model = dtree.fit(X_train, y_train)
predictions = dtree_model.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

In [ ]:

from sklearn import ensemble

original_params = {'n_estimators': 1000, 'max_leaf_nodes': 17, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}

setting = {'learning_rate': 0.1, 'subsample': 1.0}
params = dict(original_params)
params.update(setting)

gbr = ensemble.GradientBoostingRegressor(**params)
gbr_tree = tree.DecisionTreeRegressor()
gbr.fit(X_train, y_train)
predictions = gbr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

output = gbr.predict(X_train)
gbr_tree_model = gbr_tree.fit(X_train, output)

In [ ]:

from sklearn import svm

svr = svm.SVR()
svr_tree = tree.DecisionTreeRegressor()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

output = svr.predict(X_train)
svr_tree_model = svr_tree.fit(X_train, output)

In [ ]:

dot_data = tree.export_graphviz(svr_tree_model, out_file=None, 
                         feature_names=["num_goods"],  
                         class_names="GDP",  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
picture = Image(graph.create_png())

picture

Building a multidimensional analysis¶

Feature Engineering¶

Exogenous Variables¶

number of goods
number of industries
number produced by children
number produced by adults
industry (series of booleans)
country continent (coming soon!)

Endogenous Variable¶

In [ ]:

df.head()

In [ ]:

df = pd.merge(df, child_labor, on="Country")

In [ ]:

df.head()

In [ ]:

df = pd.merge(df, forced_labor, on="Country")

In [ ]:

df.head()

In [ ]:

industries = pd.get_dummies(df["industry"])
industries["Country"] = df["Country"]
df = pd.merge(df, industries, on="Country")
df.head()

In [ ]:

new_gdp = pd.DataFrame()
new_df = pd.DataFrame()
gdp["Country"] = gdp["countries"]
for country in set(gdp["Country"]):
    if country in list(df["Country"]):
        new_gdp = new_gdp.append(gdp[gdp["Country"] == country])
for country in set(df["Country"]):
    if country in list(new_gdp["Country"]):
        tmp = df[df["Country"] == country]
        if not tmp.empty:
            new_df = new_df.append(tmp.iloc[0])
len(new_gdp), len(new_df)
new_gdp = new_gdp.sort_values("Country")
new_df = new_df.sort_values("Country")
new_gdp.index = list(range(len(new_gdp)))
new_df.index = list(range(len(new_df)))

In [ ]:

new_df.head()

In [ ]:

new_df.drop("Good", axis=1, inplace=True)
new_df.drop("Country", axis=1, inplace=True)
new_df.drop("Child Labor", axis=1, inplace=True)
new_df.drop("Forced Labor", axis=1, inplace=True)
new_df.drop("industry", axis=1, inplace=True)

In [ ]:

new_df.head()

In [ ]:

multivariate_model = sm.OLS(new_gdp["2016"], new_df)
result = multivariate_model.fit()
result.summary()

In [ ]:

X = new_df[["construction", 
            "decorations",  
            "food", 
            "home_durables", 
            "medical",  
            "num_child_produced", 
            "num_forced_produced", 
            "sexual exploitation"]]
multivariate_model = sm.OLS(new_gdp["2016"], X)
result = multivariate_model.fit()
result.summary()

In [ ]:

from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(new_df, 
                                                                    new_gdp["2016"], 
                                                                    test_size=0.3, random_state=42)

In [ ]:

from sklearn import tree
from sklearn import metrics
import math
dtree = tree.DecisionTreeRegressor()
dtree_model = dtree.fit(X_train, y_train)
predictions = dtree_model.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

In [ ]:

from sklearn import ensemble

original_params = {'n_estimators': 1000, 'max_leaf_nodes': 17, 'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}

setting = {'learning_rate': 0.1, 'subsample': 1.0}
params = dict(original_params)
params.update(setting)

gbr = ensemble.GradientBoostingRegressor(**params)
gbr_tree = tree.DecisionTreeRegressor()
gbr.fit(X_train, y_train)
predictions = gbr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

output = gbr.predict(X_train)
gbr_tree_model = gbr_tree.fit(X_train, output)

In [ ]:

from sklearn import svm

svr = svm.SVR()
svr_tree = tree.DecisionTreeRegressor()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

output = svr.predict(X_train)
svr_tree_model = svr_tree.fit(X_train, output)

In [ ]:

dot_data = tree.export_graphviz(svr_tree_model, out_file=None, 
                         feature_names=new_df.columns,  
                         class_names="GDP",  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
picture = Image(graph.create_png())

picture

In [ ]:

new_df.columns
for feature in range(len(svr_tree_model.feature_importances_)):
    print(new_df.columns[feature], svr_tree_model.feature_importances_[feature])

In [ ]:

X = new_df[["num_forced_produced", "num_goods"]]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, 
                                                                    new_gdp["2016"], 
                                                                    test_size=0.3, random_state=42)

In [ ]:

svr = svm.SVR()
svr_tree = tree.DecisionTreeRegressor()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
print("RMSE Results", math.sqrt(metrics.mean_squared_error(y_test.values, predictions)))

output = svr.predict(X_train)
svr_tree_model = svr_tree.fit(X_train, output)

In [ ]:

dot_data = tree.export_graphviz(svr_tree_model, out_file=None, 
                         feature_names=X.columns,  
                         class_names="GDP",  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
picture = Image(graph.create_png())

picture

Next Steps¶

Economic Features:

CPI,
PPP,
inflation rate,
net outflow of trade by sector,
number of industries per country,
number of registered businesses per country,
size of stock market (number of stocks traded based on that country),
And relative price of countries currency.

For social factors I was going to look at:

average education level,
population density within cities,
number of national parks,
average age of population,
reproduction rate,
main religion,
racial diversity,
gender split,
average marriage age,
percentage of people who study abroad for college,
ranking on happiness index

For technological factors I was going to look at:

level of internet connectedness,
average internet speed,
most used way to connect to the internet,
And size of internet economy (number of items shipped to this country from internet market places),

For political factors I was going to look at:

level of corruption (probably just use an index here),
majority party by population,
majority party in power,
type of government (democracy, dictatorship, etc),
voter turn out

In [ ]:

df.head()

In [ ]:

df = df.sort_values("num_goods")
set(df.iloc[-1000:]["Country"])

In [31]:

for country in set(df.iloc[-1000:]["Country"]):
    print(country, df[df["Country"] == country]["num_goods"].iloc[0])

Guatemala 6.0
Benin 2.0
India  1.0
Lesotho 1.0
North Korea 7.0
Belize 3.0
Azerbaijan 1.0
Nicaragua 7.0
Malaysia 3.0
Nigeria 6.0
Sierra Leone 5.0
Mongolia 3.0
Indonesia 7.0
Kenya 9.0
China 12.0
Argentina 11.0
Burkina Faso 3.0
Egypt 2.0
Cambodia 12.0
Cameroon 1.0
El Salvador 4.0
Madagascar 3.0
Mali 3.0
Vietnam 16.0
Tajikistan 1.0
Ethiopia 3.0
Senegal 1.0
Colombia 8.0
Peru 7.0
Suriname 1.0
Central African Republic 1.0
Liberia 2.0
Ecuador 4.0
Namibia 1.0
Zambia 5.0
Bolivia 10.0
Philippines 13.0
Lebanon 2.0
Uzbekistan 2.0
Tanzania 8.0
India 22.0
South Sudan 1.0
Nepal 4.0
Turkmenistan 1.0
Guinea 5.0
Malawi 2.0
Niger 5.0
Kazakhstan 1.0
Pakistan 9.0
Paraguay 6.0
Yemen 1.0
Thailand 5.0
Kyrgyz Republic 2.0
Mauritania 2.0
Uganda 12.0
Ghana 4.0
Honduras 3.0
Iran 2.0
Ukraine 2.0
Afghanistan 4.0
Bangladesh 15.0
Angola 1.0
Cote d'Ivoire 2.0
Russia 1.0
Dominican Republic 5.0
Mozambique 1.0
Burma 14.0
Costa Rica 2.0
Sudan 1.0
Mexico 11.0
Chad 1.0
Panama 3.0
Rwanda 1.0
Turkey 8.0
Brazil 16.0
Democratic Republic of the Congo 7.0

In [32]:

set(df[df["Country"] == 'India']["industry"])

Out[32]:

{'celebratory',
 'cigarettes',
 'construction',
 'food',
 'garments',
 'gems',
 'home_durables'}

In [33]:

set(df[df["Country"] == "India"]["Good"])

Out[33]:

{'Bidis (hand-rolled cigarettes)',
 'Brassware',
 'Bricks',
 'Carpets',
 'Cottonseed (hybrid)',
 'Embellished Textiles',
 'Fireworks',
 'Footwear',
 'Garments',
 'Gems',
 'Glass Bangles',
 'Incense (agarbatti)',
 'Leather Goods/Accessories',
 'Locks',
 'Matches',
 'Rice',
 'Silk Fabric',
 'Silk Thread',
 'Soccer Balls',
 'Stones',
 'Sugarcane',
 'Thread/Yarn'}

Pay Notebook Creator: Eric Schles	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0

AvWfa

Understanding our data¶

Child Labor¶

Forced Labor¶

What is the effect of number of goods which employ forced labor on GDP?¶

Looking at 2016¶

Looking At 2017¶

Building a multidimensional analysis¶

Feature Engineering¶

Exogenous Variables¶

Endogenous Variable¶

Next Steps¶