# This script was used to create a final dataframe with three columns:
# Species
# Type (coniferous, deciduous, evergreen, or None)
# Link for that species
# Example of final output:
# 7,Acer pseudoplatanus,deciduous,https://en.wikipedia.org/wiki?curid=437919
# The final results were then saved as a new csv file.
# As a team, we decided that it would be faster/more efficient to do this process just once
# and then load the newly created csv file in our main program (see the sketch below),
# so that we don't have to call the Wikipedia API every time.
# This saves a lot of time!
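# A minimal sketch of how the main program might reuse the cached file
# (hedged: assumes the 'Species' and 'Type' column names written at the bottom of this script):
#   species_df = pd.read_csv('Species_Types.csv')
#   type_lookup = dict(zip(species_df['Species'], species_df['Type']))
#   type_lookup['Acer pseudoplatanus']   # expected to be 'deciduous' per the example above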
import pandas as pd
import wikipedia # Used to access/work with the Wikipedia API
import datetime
# Function #1: takes a species name (in Latin) as a parameter and returns its type
# ('deciduous', 'coniferous', 'evergreen', or 'None')
def LookInSummary(treeSpeciesInLatin):
    # Rate-limit the API: wait at least 10 milliseconds between calls
    wikipedia.set_rate_limiting(rate_limit=True, min_wait=datetime.timedelta(milliseconds=10))
    # First get the summary from that species' Wikipedia page
    try:
        summary = wikipedia.summary(treeSpeciesInLatin)
    # If it finds nothing OR multiple possible pages, just return None (do nothing)
    except wikipedia.exceptions.DisambiguationError:
        return None
    except wikipedia.exceptions.PageError:
        return None
    if summary is None:
        return None
    # If "deciduous", "coniferous", or "evergreen" appears in the summary, that is the tree's type
    if "deciduous" in summary:
        return 'deciduous'
    elif "coniferous" in summary:
        return 'coniferous'
    elif "evergreen" in summary:
        return 'evergreen'
    # If the type isn't in the summary, it might be in the page's "Description" section
    else:
        page = wikipedia.page(treeSpeciesInLatin)
        if page is None:
            return None
        description = page.section("Description")
        if description is None:
            return None
        if "deciduous" in description:
            return 'deciduous'
        elif "coniferous" in description:
            return 'coniferous'
        elif "evergreen" in description:
            return 'evergreen'
    return 'None'
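# Example usage (based on the sample output row at the top, assuming that page's text still mentions "deciduous"):
#   LookInSummary('Acer pseudoplatanus')   # -> 'deciduous'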
# Function #2: takes a species name (in Latin) as a parameter and returns the URL of its Wikipedia page
def getLink(treeSpeciesInLatin):
    try:
        webpage = wikipedia.page(treeSpeciesInLatin)
    except wikipedia.exceptions.DisambiguationError:
        return None
    except wikipedia.exceptions.PageError:
        return None
    # Build a stable link from the page's numeric curid
    pageID = str(webpage.pageid)
    return 'https://en.wikipedia.org/wiki?curid=' + pageID
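# Example usage (curid taken from the sample output row at the top):
#   getLink('Acer pseudoplatanus')   # -> 'https://en.wikipedia.org/wiki?curid=437919'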
# Use the original CSV file, open it as a dataframe
df = pd.read_csv('Copy of 2015StreetTreesCensus_TREES.csv')
# Only get the 'spc_latin' column for each tree
df = df['spc_latin']
# Get a list of all of the unique species (about 132 different species)
listOfAllTreeSpecies = df.unique()
# Turn that list into a dataframe with the column heading 'Species'
df = pd.DataFrame({'Species':listOfAllTreeSpecies})
# Add a new column to the dataframe named 'Type'
# Each value in this column is the type of that species
# To get the type, we used Function # 1 from above
df['Type'] = list(map(LookInSummary, listOfAllTreeSpecies))
# If a value is blank/missing (NaN), fill it in with the placeholder 'Obama' as a safe word, to distinguish it from the string 'None'
df = df.fillna('Obama')
# Add a new column to the dataframe named 'URL Links'
# Each value in this column is the Wikipedia Link for that species
# To get the link, we used Function # 2 from above
df['URL Links'] = list(map(getLink, df['Species']))
# Exports the new dataframe as a csv file
df.to_csv('Species_Types.csv')
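# Optional sanity check, a minimal sketch: read the cached file back in and confirm the
# columns match the sample row at the top (assumes the to_csv call above succeeded)
check = pd.read_csv('Species_Types.csv')
print(check.head())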