cat journals.txt
Science
Nature
cat authors.txt
Reshma Jagsi (temple)
salah ahmed
add affiliations for authors in parentheses next to the name
this allows a more precise search (sadly, PubMed doesn't offer much for precision)
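a quick check of the name (affiliation) convention; get_expression below splits it with the same pattern
import re
match = re.search(r'(.+?)\s*\((.+)\)', 'Reshma Jagsi (temple)')
print(match.groups())  # ('Reshma Jagsi', 'temple')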
cat keywords.txt
poverty
income
cat mesh.txt
social class
socioeconomic factors
("Reshma Jagsi"[Author] AND ("temple"[Affilliation])) AND ("income"[Text Word] OR
"poverty"[Text Word] OR "social class"[MeSH Terms] OR
"socioeconomic factors"[MeSH Terms]) AND
("%s"[Date - Publication] : "%s"[Date - Publication]) AND
# CrossCompute
authors_text_path = 'authors.txt'
keywords_text_path = 'keywords.txt'
mesh_text_path = 'mesh.txt'
from_date = '1-1-1990'
to_date = '12-31-2015'
interval_in_years_int = 10
target_folder = '/tmp'
import datetime as dt
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from datetime import datetime
from dateutil.parser import parse as parse_date
from os.path import join
%matplotlib inline
class ToolError(Exception):
pass
def get_date_ranges(from_date, to_date, interval_in_years):
"""
Retrieve pairs of date ranges based on interval number
"""
if from_date and to_date and from_date > to_date:
raise ToolError('to_date must be after from_date')
if not interval_in_years:
return [(from_date, to_date)]
date_ranges = []
date_b = from_date - dt.timedelta(days=1)
while date_b < to_date:
date_a = date_b + dt.timedelta(days=1)
date_b = datetime(
date_a.year + interval_in_years, date_a.month, date_a.day,
) - dt.timedelta(days=1)
if date_b > to_date:
date_b = to_date
date_ranges.append((date_a, date_b))
return date_ranges
f = parse_date(from_date)
t = parse_date(to_date)
try:
    date_ranges = get_date_ranges(f, t, interval_in_years_int)
except ToolError as e:
    print('date_ranges.error = {0}'.format(e))
    raise
for a, b in date_ranges:
print('%s to %s' % (a, b))
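with the default dates and a 10-year interval, this should print three ranges (a sanity check, not captured output):
# 1990-01-01 00:00:00 to 1999-12-31 00:00:00
# 2000-01-01 00:00:00 to 2009-12-31 00:00:00
# 2010-01-01 00:00:00 to 2015-12-31 00:00:00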
def load_unique_lines(source_path):
    """
    Load sorted unique non-blank lines from a text file
    """
if not source_path:
return []
with open(source_path, 'r') as f:
lines = set((x.strip('\n, ;') for x in f))
return sorted(filter(lambda x: x, lines))
text_words = load_unique_lines(keywords_text_path)
mesh_terms = load_unique_lines(mesh_text_path)
authors = load_unique_lines(authors_text_path)
for a in authors:
print("'%s'" % a)
def get_expression(
author_name, from_date, to_date,
text_terms=None, mesh_terms=None, custom_expression=None):
"""
Retrieve expression based on inputs.
Expressions are constructed in this layout:
("%s"[Journal]) AND ("%s"[Text Word] OR
"%s"[Text Word] OR "%s"[MeSH Terms] OR
"%s"[MeSH Terms]) AND
("%s"[Date - Publication] : "%s"[Date - Publication])
"""
expression_parts = []
    PATTERN_AFFILIATION = re.compile(r'(.+?)\s*\((.+)\)')  # non-greedy so the name keeps no trailing space
    match = PATTERN_AFFILIATION.search(author_name)
    if match:
        author_name, affiliation_string = match.groups()
        affiliations = [x.strip() for x in affiliation_string.split(',')]
        affiliation_expression = ' OR '.join(
            '"{0}"[Affiliation]'.format(x) for x in affiliations)
        expression_parts.append('"{0}"[Author] AND ({1})'.format(
            author_name, affiliation_expression))
    else:
        expression_parts.append('"{0}"[Author]'.format(author_name))
if custom_expression:
expression_parts.append(custom_expression)
if text_terms or mesh_terms:
terms = []
terms.extend('"{0}"[Text Word]'.format(x) for x in text_terms or [])
terms.extend('"{0}"[MeSH Terms]'.format(x) for x in mesh_terms or [])
expression_parts.append(' OR '.join(terms))
    from_date_string = from_date.strftime('%Y/%m/%d')
    to_date_string = to_date.strftime('%Y/%m/%d') if to_date else '3000'
    expression_parts.append(
        '"{0}"[Date - Publication] : "{1}"[Date - Publication]'.format(
            from_date_string, to_date_string))
if len(expression_parts) <= 1:
expression = ''.join(expression_parts)
else:
expression = '({0})'.format(') AND ('.join(expression_parts))
    return expression
q = get_expression(authors[0], f, t, text_words, mesh_terms)
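journals.txt is shown above but never used in get_expression; a minimal sketch of how a [Journal] clause could be folded in (get_journal_expression is hypothetical, not part of the tool):
# Hypothetical helper: restrict a query to the journals in journals.txt
def get_journal_expression(journal_names):
    return ' OR '.join('"{0}"[Journal]'.format(x) for x in journal_names)

journals = load_unique_lines('journals.txt')
# e.g. '("Nature"[Journal] OR "Science"[Journal]) AND ' + q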
def get_search_count(expression, retstart=0, retmax=1000):
    """
    Retrieve the full list of matching article ids for the expression,
    paging through results retmax at a time
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'pubmed', 'term': expression,
        'retmax': str(retmax),  # max number of ids per request
        'retstart': str(retstart)}
    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.text, 'xml')
    count = int(soup.find('Count').next_element)
    articles_list = [
        str(article.next_element)
        for article in soup.find('IdList').find_all('Id')]
    if count > (retmax + retstart):
        # retstart is a zero-based offset, so the next page begins
        # exactly retmax ids after this one
        articles_list.extend(
            get_search_count(expression, retstart=retstart + retmax))
    return articles_list
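NCBI asks E-utilities clients to stay under about three requests per second without an API key; the loops below fire many requests, so a throttled wrapper is safer (a minimal sketch, not part of the original tool):
import time

def throttled_get(url, params):
    # Space out requests to respect NCBI's E-utilities rate limit
    response = requests.get(url, params=params)
    time.sleep(0.34)  # ~3 requests per second
    return response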
l = get_search_count(q)
def translate_name(name):
    """
    Convert "First [Middle] Last" into PubMed's "Last FM" author format
    """
    # Drop any "(affiliation)" suffix from the authors.txt convention,
    # so 'Reshma Jagsi (temple)' translates to 'Jagsi R'
    name = re.sub(r'\s*\(.+\)\s*$', '', name).strip()
    first_middle_last = 3
    parts_of_name = name.split(' ')
    translated_name = parts_of_name[-1] + ' ' + parts_of_name[0][0]
    if len(parts_of_name) == first_middle_last:
        translated_name += parts_of_name[1][0]
    return translated_name
authors[0]
translate_name(authors[0])
def get_first_name_articles(author, article_ids_list):
    """
    Return the ids of articles that list the author first
    """
    translated_name = translate_name(author)
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    articles_param = ','.join(article_ids_list)
    params = {'db': 'pubmed', 'id': articles_param}
    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.text, 'xml')
first_named_articles = []
for article_info in soup.find_all('DocSum'):
auth = article_info.find(
"Item", attrs={"Name": "AuthorList"}).findChild().next_element
article = article_info.find("Id").next_element
if auth.lower() == translated_name.lower():
first_named_articles.append(article)
return first_named_articles
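get_first_name_articles sends every article id in a single GET request; for prolific authors the URL can exceed length limits, so batching ids is safer (a sketch, assuming batches of 200):
def get_first_name_articles_batched(author, article_ids_list, batch_size=200):
    # Same logic as above, but queries esummary one id batch at a time
    results = []
    for i in range(0, len(article_ids_list), batch_size):
        results.extend(get_first_name_articles(
            author, article_ids_list[i:i + batch_size]))
    return results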
get_first_name_articles(authors[0], l)
dates = []
log = []
author_articles = defaultdict(list)
counts = defaultdict(list)
keyword_counts = defaultdict(list)
for from_date, to_date in date_ranges:
dates.append(pd.Timestamp(from_date))
for item in authors:
query_param = {'author_name': item}
# Query totals (w/o keywords)
item_query = get_expression(
from_date=from_date, to_date=to_date, **query_param)
item_articles = get_search_count(item_query)
item_count = len(item_articles)
query = get_expression(
text_terms=text_words,
mesh_terms=mesh_terms,
from_date=from_date, to_date=to_date,
**query_param)
articles = get_search_count(query)
keyword_count = len(articles)
log.append("{query}\n{count}".format(
query=item_query, count=item_count))
log.append("{query}\n{count}".format(
query=query, count=keyword_count))
        author_articles[item].extend(item_articles)
        counts[item].append(item_count)  # totals without keywords
        keyword_counts[item].append(keyword_count)  # totals with keywords
author_articles
dates
counts
keyword_counts
index = pd.Index(dates, name='dates')
search_counts = pd.DataFrame(counts, index=index)
search_counts
def saveimage(df, image_path, title):
axes = df.plot()
axes.set_title(title)
figure = axes.get_figure()
figure.savefig(image_path)
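saveimage leaves each figure open after saving; closing it explicitly avoids piling up figures in longer sessions (a variant sketch using matplotlib.pyplot directly):
import matplotlib.pyplot as plt

def save_image_and_close(df, image_path, title):
    # Same as saveimage above, but releases the figure once it is on disk
    axes = df.plot(title=title)
    figure = axes.get_figure()
    figure.savefig(image_path)
    plt.close(figure)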
search_count_path = join(target_folder, 'search_counts.csv')
search_counts.to_csv(search_count_path)
print("search_count_table_path = " + search_count_path)
if interval_in_years_int:
title = 'Article Counts over time'
image_path = join(target_folder, 'article_count.png')
saveimage(search_counts, image_path, title)
print('plot_image_path = ' + image_path)
keyword_search_counts = pd.DataFrame(keyword_counts, index=index)
keyword_search_counts
keyword_search_count_path = join(
target_folder, 'keyword_search_counts.csv')
keyword_search_counts.to_csv(keyword_search_count_path)
print("keyword_search_count_table_path = " + keyword_search_count_path)
if interval_in_years_int:
title = 'Article Counts over time with Keywords'
image_path = join(target_folder, 'keyword_article_count.png')
saveimage(
keyword_search_counts,
image_path,
title)
print('keywords_plot_image_path = ' + image_path)
cols = ['Author', 'No. of first-named articles']
first_name_articles = [
(name, len(
get_first_name_articles(name, author_articles[name])))
for name in authors]
df = pd.DataFrame(first_name_articles, columns=cols)
df
first_name_path = join(
target_folder, 'first_named_articles.csv')
df.to_csv(first_name_path, index=False)
print("first_name_articles_table_path = " + first_name_path)
log[:5]
log_path = join(target_folder, 'log.txt')
with open(log_path, 'w') as f:
f.write('\n\n'.join(log))
print('log_text_path = ' + log_path)