cat journals.txt
Science
Nature
cat authors.txt
Reshma Jagsi (temple)
salah ahmed
add affiliations for authors in parentheses next to the name
this allows a more precise search (sadly, PubMed doesn't offer much for precision)
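a quick check of the name (affiliation) convention; get_expression below splits it with the same pattern
import re
match = re.search(r'(.+?)\s*\((.+)\)', 'Reshma Jagsi (temple)')
print(match.groups())  # ('Reshma Jagsi', 'temple')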
cat keywords.txt
poverty
income
cat mesh.txt
social class
socioeconomic factors
("Reshma Jagsi"[Author] AND ("temple"[Affilliation])) AND ("income"[Text Word] OR
"poverty"[Text Word] OR "social class"[MeSH Terms] OR
"socioeconomic factors"[MeSH Terms]) AND
("%s"[Date - Publication] : "%s"[Date - Publication]) AND
# CrossCompute
authors_text_path = 'authors.txt'
keywords_text_path = 'keywords.txt'
mesh_text_path = 'mesh.txt'
from_date = '1-1-1990'
to_date = '12-31-2015'
interval_in_years_int = 10
target_folder = '/tmp'
import datetime as dt
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from datetime import datetime
from dateutil.parser import parse as parse_date
from os.path import join
%matplotlib inline
class ToolError(Exception):
pass
def get_date_ranges(from_date, to_date, interval_in_years):
"""
Retrieve pairs of date ranges based on interval number
"""
if from_date and to_date and from_date > to_date:
raise ToolError('to_date must be after from_date')
if not interval_in_years:
return [(from_date, to_date)]
date_ranges = []
date_b = from_date - dt.timedelta(days=1)
while date_b < to_date:
date_a = date_b + dt.timedelta(days=1)
date_b = datetime(
date_a.year + interval_in_years, date_a.month, date_a.day,
) - dt.timedelta(days=1)
if date_b > to_date:
date_b = to_date
date_ranges.append((date_a, date_b))
return date_ranges
f = parse_date(from_date)
t = parse_date(to_date)
try:
    date_ranges = get_date_ranges(f, t, interval_in_years_int)
except ToolError as e:
    print('date_ranges.error = {0}'.format(e))
    raise
for a, b in date_ranges:
print('%s to %s' % (a, b))
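with the default dates and a 10-year interval, this should print three ranges (a sanity check, not captured output):
# 1990-01-01 00:00:00 to 1999-12-31 00:00:00
# 2000-01-01 00:00:00 to 2009-12-31 00:00:00
# 2010-01-01 00:00:00 to 2015-12-31 00:00:00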
def load_unique_lines(source_path):
    """
    Load sorted unique non-blank lines from a text file
    """
if not source_path:
return []
with open(source_path, 'r') as f:
lines = set((x.strip('\n, ;') for x in f))
return sorted(filter(lambda x: x, lines))
text_words = load_unique_lines(keywords_text_path)
mesh_terms = load_unique_lines(mesh_text_path)
authors = load_unique_lines(authors_text_path)
for a in authors:
print("'%s'" % a)
def get_expression(
author_name, from_date, to_date,
text_terms=None, mesh_terms=None, custom_expression=None):
"""
Retrieve expression based on inputs.
Expressions are constructed in this layout:
("%s"[Journal]) AND ("%s"[Text Word] OR
"%s"[Text Word] OR "%s"[MeSH Terms] OR
"%s"[MeSH Terms]) AND
("%s"[Date - Publication] : "%s"[Date - Publication])
"""
expression_parts = []
    PATTERN_AFFILIATION = re.compile(r'(.+?)\s*\((.+)\)')  # non-greedy so the name keeps no trailing space
    match = PATTERN_AFFILIATION.search(author_name)
    if match:
        author_name, affiliation_string = match.groups()
        affiliations = [x.strip() for x in affiliation_string.split(',')]
        affiliation_expression = ' OR '.join(
            '"{0}"[Affiliation]'.format(x) for x in affiliations)
        expression_parts.append('"{0}"[Author] AND ({1})'.format(
            author_name, affiliation_expression))
    else:
        expression_parts.append('"{0}"[Author]'.format(author_name))
if custom_expression:
expression_parts.append(custom_expression)
if text_terms or mesh_terms:
terms = []
terms.extend('"{0}"[Text Word]'.format(x) for x in text_terms or [])
terms.extend('"{0}"[MeSH Terms]'.format(x) for x in mesh_terms or [])
expression_parts.append(' OR '.join(terms))
    from_date_string = from_date.strftime('%Y/%m/%d')
    to_date_string = to_date.strftime('%Y/%m/%d') if to_date else '3000'
    expression_parts.append(
        '"{0}"[Date - Publication] : "{1}"[Date - Publication]'.format(
            from_date_string, to_date_string))
if len(expression_parts) <= 1:
expression = ''.join(expression_parts)
else:
expression = '({0})'.format(') AND ('.join(expression_parts))
    return expression
q = get_expression(authors[0], f, t, text_words, mesh_terms)
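journals.txt is shown above but never used in get_expression; a minimal sketch of how a [Journal] clause could be folded in (get_journal_expression is hypothetical, not part of the tool):
# Hypothetical helper: restrict a query to the journals in journals.txt
def get_journal_expression(journal_names):
    return ' OR '.join('"{0}"[Journal]'.format(x) for x in journal_names)

journals = load_unique_lines('journals.txt')
# e.g. '("Nature"[Journal] OR "Science"[Journal]) AND ' + q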
def get_search_count(expression, retstart=0, retmax=1000):
    """
    Retrieve the full list of matching article ids for the expression,
    paging through results retmax at a time
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'pubmed', 'term': expression,
        'retmax': str(retmax),  # max number of ids per request
        'retstart': str(retstart)}
    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.text, 'xml')
    count = int(soup.find('Count').next_element)
    articles_list = [
        str(article.next_element)
        for article in soup.find('IdList').find_all('Id')]
    if count > (retmax + retstart):
        # retstart is a zero-based offset, so the next page begins
        # exactly retmax ids after this one
        articles_list.extend(
            get_search_count(expression, retstart=retstart + retmax))
    return articles_list
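NCBI asks E-utilities clients to stay under about three requests per second without an API key; the loops below fire many requests, so a throttled wrapper is safer (a minimal sketch, not part of the original tool):
import time

def throttled_get(url, params):
    # Space out requests to respect NCBI's E-utilities rate limit
    response = requests.get(url, params=params)
    time.sleep(0.34)  # ~3 requests per second
    return response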
l = get_search_count(q)
def translate_name(name):
    """
    Convert "First [Middle] Last" into PubMed's "Last FM" author format
    """
    # Drop any "(affiliation)" suffix from the authors.txt convention,
    # so 'Reshma Jagsi (temple)' translates to 'Jagsi R'
    name = re.sub(r'\s*\(.+\)\s*$', '', name).strip()
    first_middle_last = 3
    parts_of_name = name.split(' ')
    translated_name = parts_of_name[-1] + ' ' + parts_of_name[0][0]
    if len(parts_of_name) == first_middle_last:
        translated_name += parts_of_name[1][0]
    return translated_name
authors[0]
translate_name(authors[0])
def get_first_name_articles(author, article_ids_list):
    """
    Return the ids of articles that list the author first
    """
    translated_name = translate_name(author)
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    articles_param = ','.join(article_ids_list)
    params = {'db': 'pubmed', 'id': articles_param}
    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.text, 'xml')
first_named_articles = []
for article_info in soup.find_all('DocSum'):
auth = article_info.find(
"Item", attrs={"Name": "AuthorList"}).findChild().next_element
article = article_info.find("Id").next_element
if auth.lower() == translated_name.lower():
first_named_articles.append(article)
return first_named_articles
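get_first_name_articles sends every article id in a single GET request; for prolific authors the URL can exceed length limits, so batching ids is safer (a sketch, assuming batches of 200):
def get_first_name_articles_batched(author, article_ids_list, batch_size=200):
    # Same logic as above, but queries esummary one id batch at a time
    results = []
    for i in range(0, len(article_ids_list), batch_size):
        results.extend(get_first_name_articles(
            author, article_ids_list[i:i + batch_size]))
    return results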
get_first_name_articles(authors[0], l)
dates = []
log = []
author_articles = defaultdict(list)
counts = defaultdict(list)
keyword_counts = defaultdict(list)
for from_date, to_date in date_ranges:
dates.append(pd.Timestamp(from_date))
for item in authors:
query_param = {'author_name': item}
# Query totals (w/o keywords)
item_query = get_expression(
from_date=from_date, to_date=to_date, **query_param)
item_articles = get_search_count(item_query)
item_count = len(item_articles)
query = get_expression(
text_terms=text_words,
mesh_terms=mesh_terms,
from_date=from_date, to_date=to_date,
**query_param)
articles = get_search_count(query)
keyword_count = len(articles)
log.append("{query}\n{count}".format(
query=item_query, count=item_count))
log.append("{query}\n{count}".format(
query=query, count=keyword_count))
        author_articles[item].extend(item_articles)
        counts[item].append(item_count)  # totals without keywords
        keyword_counts[item].append(keyword_count)  # totals with keywords
author_articles
dates
counts
keyword_counts
index = pd.Index(dates, name='dates')
search_counts = pd.DataFrame(counts, index=index)
search_counts
def saveimage(df, image_path, title):
axes = df.plot()
axes.set_title(title)
figure = axes.get_figure()
figure.savefig(image_path)
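saveimage leaves each figure open after saving; closing it explicitly avoids piling up figures in longer sessions (a variant sketch using matplotlib.pyplot directly):
import matplotlib.pyplot as plt

def save_image_and_close(df, image_path, title):
    # Same as saveimage above, but releases the figure once it is on disk
    axes = df.plot(title=title)
    figure = axes.get_figure()
    figure.savefig(image_path)
    plt.close(figure)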
search_count_path = join(target_folder, 'search_counts.csv')
search_counts.to_csv(search_count_path)
print("search_count_table_path = " + search_count_path)
if interval_in_years_int:
title = 'Article Counts over time'
image_path = join(target_folder, 'article_count.png')
saveimage(search_counts, image_path, title)
print('plot_image_path = ' + image_path)
keyword_search_counts = pd.DataFrame(keyword_counts, index=index)
keyword_search_counts
keyword_search_count_path = join(
target_folder, 'keyword_search_counts.csv')
keyword_search_counts.to_csv(keyword_search_count_path)
print("keyword_search_count_table_path = " + keyword_search_count_path)
if interval_in_years_int:
title = 'Article Counts over time with Keywords'
image_path = join(target_folder, 'keyword_article_count.png')
saveimage(
keyword_search_counts,
image_path,
title)
print('keywords_plot_image_path = ' + image_path)
cols = ['Author', 'No. of first-named articles']
first_name_articles = [
(name, len(
get_first_name_articles(name, author_articles[name])))
for name in authors]
df = pd.DataFrame(first_name_articles, columns=cols)
df
first_name_path = join(
target_folder, 'first_named_articles.csv')
df.to_csv(first_name_path, index=False)
print("first_name_articles_table_path = " + first_name_path)
log[:5]
log_path = join(target_folder, 'log.txt')
with open(log_path, 'w') as f:
f.write('\n\n'.join(log))
print('log_text_path = ' + log_path)