tabulate keywords




Pay Notebook Creator: Salah Ahmed0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0

Count PubMed search results

Example queries generated

cat journals.txt

 Science
 Nature

cat authors.txt

 Reshma Jagsi (temple)
 salah ahmed

Add affiliations for authors in parentheses next to the name.

this allows a more precise search (sadly pubmed doesn't offer much for precision)

cat keywords.txt

 poverty
 income

cat mesh.txt

 social class
 socioeconomic factors

Author tool

("Reshma Jagsi"[Author] AND ("temple"[Affiliation])) AND ("income"[Text Word] OR
    "poverty"[Text Word] OR "social class"[MeSH Terms] OR
        "socioeconomic factors"[MeSH Terms]) AND
        ("%s"[Date - Publication] : "%s"[Date - Publication]) AND
In [38]:
# CrossCompute
# Input file paths and search parameters for the PubMed search-count notebook.
authors_text_path = 'authors.txt'
keywords_text_path = 'keywords.txt'
mesh_text_path = 'mesh.txt'
from_date = '1-1-1990'
to_date = '12-31-2015'
# Width of each date bucket; 0/None disables bucketing (single range).
interval_in_years_int = 10 
# Output folder for the generated CSV tables, plots and log.
target_folder = '/tmp'
In [55]:
import datetime as dt
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from datetime import datetime
from dateutil.parser import parse as parse_date
from os.path import join
In [88]:
%matplotlib inline
In [29]:
class ToolError(Exception):
    """Raised for invalid tool inputs (e.g. to_date before from_date)."""
    pass
In [31]:
def get_date_ranges(from_date, to_date, interval_in_years):
    """Split [from_date, to_date] into consecutive interval-sized ranges.

    Each range runs from an interval start to the day before the next
    interval's anniversary (or to_date, whichever comes first). With a
    falsy interval_in_years a single (from_date, to_date) pair is
    returned.

    Raises ToolError when to_date precedes from_date.
    """
    if from_date and to_date and from_date > to_date:
        raise ToolError('to_date must be after from_date')
    if not interval_in_years:
        return [(from_date, to_date)]
    date_ranges = []
    date_b = from_date - dt.timedelta(days=1)
    while date_b < to_date:
        date_a = date_b + dt.timedelta(days=1)
        target_year = date_a.year + interval_in_years
        try:
            anniversary = datetime(target_year, date_a.month, date_a.day)
        except ValueError:
            # date_a is Feb 29 and target_year is not a leap year; the old
            # code crashed here. Fall back to Mar 1 so subtracting one day
            # below still lands on the last day of the interval (Feb 28).
            anniversary = datetime(target_year, 3, 1)
        date_b = anniversary - dt.timedelta(days=1)
        if date_b > to_date:
            date_b = to_date
        date_ranges.append((date_a, date_b))
    return date_ranges
In [44]:
# Parse the configured date strings and split them into interval ranges.
f = parse_date(from_date)
t = parse_date(to_date)
try:
    date_ranges = get_date_ranges(f, t, interval_in_years_int)
except ToolError as e:
    print('date_ranges.error = {0}'.format(e))
    # Re-raise the caught exception (keeping its message and traceback)
    # instead of raising a fresh, message-less ToolError.
    raise
for a, b in date_ranges:
    print('%s to %s' % (a, b))
1990-01-01 00:00:00 to 1999-12-31 00:00:00
2000-01-01 00:00:00 to 2009-12-31 00:00:00
2010-01-01 00:00:00 to 2015-12-31 00:00:00
In [34]:
def load_unique_lines(source_path):
    """Return the unique, non-empty lines of source_path, sorted.

    Each line is stripped of trailing newlines plus surrounding spaces,
    commas and semicolons. A falsy source_path yields an empty list.
    """
    if not source_path:
        return []
    unique = set()
    with open(source_path, 'r') as stream:
        for raw_line in stream:
            unique.add(raw_line.strip('\n, ;'))
    return sorted(line for line in unique if line)
In [43]:
# Load the unique keyword, MeSH-term and author lists from the input files.
text_words = load_unique_lines(keywords_text_path)
mesh_terms = load_unique_lines(mesh_text_path)
authors = load_unique_lines(authors_text_path)
for a in authors:
    print("'%s'" % a)
'Curtiland Deville'
'Emma Holliday'
'Reshma Jagsi'
In [53]:
def get_expression(
        author_name, from_date, to_date, 
        text_terms=None, mesh_terms=None, custom_expression=None):
    """Build a PubMed search expression for one author.

    author_name may carry a parenthesized, comma-separated affiliation
    list, e.g. 'Reshma Jagsi (temple)'. text_terms become [Text Word]
    clauses and mesh_terms become [MeSH Terms] clauses, OR-ed together;
    the whole query is ANDed with the publication-date range. A falsy
    to_date leaves the range open-ended (year 3000).

    Expressions are constructed in this layout:

    ("%s"[Journal]) AND ("%s"[Text Word] OR
    "%s"[Text Word] OR "%s"[MeSH Terms] OR
        "%s"[MeSH Terms]) AND
        ("%s"[Date - Publication] : "%s"[Date - Publication])
    """
    expression_parts = []
    PATTERN_AFFILIATION = re.compile(r'(.+)\s*\((.+)\)')
    match = PATTERN_AFFILIATION.search(author_name)
    if match:
        author_name, affiliation_string = match.groups()
        # The greedy (.+) captures the space before '(' too; strip it so
        # the query reads 'Name[Author]', not 'Name [Author]'.
        author_name = author_name.strip()
        affiliations = [x.strip() for x in affiliation_string.split(',')]
        affiliation_expression = ' OR '.join(
            '{0}[Affiliation]'.format(x) for x in affiliations)
        expression_parts.append('{0}[Author] AND ({1})'.format(
            author_name, affiliation_expression))
    else:
        expression_parts.append('{0}[Author]'.format(author_name))
    if custom_expression:
        expression_parts.append(custom_expression)
    if text_terms or mesh_terms:
        terms = []
        terms.extend('"{0}"[Text Word]'.format(x) for x in text_terms or [])
        terms.extend('"{0}"[MeSH Terms]'.format(x) for x in mesh_terms or [])
        expression_parts.append(' OR '.join(terms))
    from_date_string = from_date.strftime(
        '%Y/%m/%d')
    to_date_string = to_date.strftime(
        '%Y/%m/%d') if to_date else '3000'
    expression_parts.append(
        '"{0}"[Date - Publication] : "{1}"[Date - Publication]'.format(
            from_date_string, to_date_string))
    if len(expression_parts) <= 1:
        expression = ''.join(expression_parts)
    else:
        expression = '({0})'.format(') AND ('.join(expression_parts))
    return expression
In [57]:
# Sample query: the first author plus all keywords over the full date range.
q = get_expression(authors[0], f, t, text_words, mesh_terms)
In [58]:
def get_search_count(expression, retstart=0, retmax=1000):
    """Return the list of PubMed article ids matching expression.

    Pages through esearch results retmax ids at a time, recursing until
    all ids are collected. Despite the name, this returns the id list;
    callers take len() of it for the count.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # max num of articles to list
    params = {'db': 'pubmed', 'term': expression,
              'retmax': str(retmax), 'retstart': str(retstart)}
    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.text, 'xml')
    count = int(soup.find('Count').next_element)
    articles_list = [str(article.next_element) for
                     article in soup.find('IdList').find_all('Id')]
    if count > (retmax + retstart):
        # retstart is a 0-based offset, so the next page begins exactly
        # retmax past this one (the old +1 skipped one article). The
        # recursive call returns a flat id list; extend with it directly
        # (the old code indexed [1], spreading one id's characters).
        articles_list.extend(get_search_count(
            expression, retstart=retstart + retmax))
    return articles_list
In [64]:
# Fetch the article ids matching the sample query.
l = get_search_count(q)
In [61]:
def translate_name(name):
    """Convert 'First [Middle] Last' into PubMed's 'Last FM' short form.

    A middle initial is appended only when the name has exactly three
    space-separated parts.
    """
    FIRST_MIDDLE_LAST = 3
    parts = name.split(' ')
    short_form = '{0} {1}'.format(parts[-1], parts[0][0])
    if len(parts) == FIRST_MIDDLE_LAST:
        short_form += parts[1][0]
    return short_form
In [68]:
authors[0]
Out[68]:
'Curtiland Deville'
In [69]:
translate_name(authors[0])
Out[69]:
'Deville C'
In [70]:
def get_first_name_articles(author, article_ids_list):
    """Return the ids of articles on which author is the first author.

    Fetches esummary records for all ids in one request, then keeps each
    article whose first AuthorList entry matches the 'Last FM' short
    form of author (case-insensitive).
    """
    # articles = list(set(articles))
    translated_name = translate_name(author)
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    articles_param = ','.join(article_ids_list)
    params = {'db': 'pubmed', 'id': articles_param}
    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.text, 'xml')
    first_named_articles = []
    for article_info in soup.find_all('DocSum'):
        author_list = article_info.find(
            "Item", attrs={"Name": "AuthorList"})
        if author_list is None:
            # Some records carry no AuthorList (e.g. anonymous items);
            # the old code crashed here with AttributeError.
            continue
        first_author = author_list.findChild()
        if first_author is None:
            continue
        auth = first_author.next_element
        article = article_info.find("Id").next_element
        if auth.lower() == translated_name.lower():
            first_named_articles.append(article)
    return first_named_articles
In [71]:
get_first_name_articles(authors[0], l)
Out[71]:
[]
In [79]:
# Accumulators: one entry per date range (rows), keyed per author (columns).
dates = []
log = []
author_articles = defaultdict(list)
counts = defaultdict(list)
keyword_counts = defaultdict(list)
In [80]:
# NOTE(review): this loop rebinds the module-level from_date/to_date
# strings to datetime pairs from date_ranges — confirm nothing later
# relies on the original string values.
for from_date, to_date in date_ranges:
    # Row label for this interval: the start of the range.
    dates.append(pd.Timestamp(from_date))
    for item in authors:
        query_param = {'author_name': item}
        # Query totals (w/o keywords)
        item_query = get_expression(
            from_date=from_date, to_date=to_date, **query_param)
        item_articles = get_search_count(item_query)
        item_count = len(item_articles)
        # Same author, restricted to the text words and MeSH terms.
        query = get_expression(
            text_terms=text_words,
            mesh_terms=mesh_terms,
            from_date=from_date, to_date=to_date,
            **query_param)
        articles = get_search_count(query)
        keyword_count = len(articles)
        # Record each query and its hit count for the log file.
        log.append("{query}\n{count}".format(
            query=item_query, count=item_count))
        log.append("{query}\n{count}".format(
            query=query, count=keyword_count))
        author_articles[item].extend(item_articles)
        # Get search count data for each Query (w/ keywords)
        counts[item].append(item_count)
        keyword_counts[item].append(keyword_count)
In [95]:
author_articles
Out[95]:
defaultdict(list,
            {'Curtiland Deville': ['17268590',
              '26807444',
              '26452568',
              '26323390',
              '26301524',
              '26194680',
              '26116069',
              '26068493',
              '26018727',
              '25442043',
              '25423899',
              '25413411',
              '25304288',
              '25195989',
              '25093218',
              '25087987',
              '25052501',
              '25023042',
              '24993534',
              '24411628',
              '23958148',
              '23901125',
              '23578497',
              '23363650',
              '23122983',
              '23102446',
              '22917963',
              '22851101',
              '22763029',
              '22658217',
              '22621764',
              '22381899',
              '22115790',
              '21664069',
              '21035952',
              '20171807'],
             'Emma Holliday': ['27570583',
              '26743832',
              '26705956',
              '26561041',
              '26471280',
              '26454680',
              '26452568',
              '26104927',
              '25936811',
              '25892584',
              '25835616',
              '25627289',
              '25279958',
              '25112462',
              '25035195',
              '24837890',
              '24678385',
              '24674363',
              '24667510',
              '24210670',
              '24189127',
              '23828146',
              '23224629',
              '22560553',
              '22544537'],
             'Reshma Jagsi': ['19949146',
              '19847566',
              '19826024',
              '19732688',
              '19720966',
              '19507175',
              '19434666',
              '19180636',
              '19065638',
              '18332302',
              '18332295',
              '18212304',
              '17989381',
              '17498541',
              '17448878',
              '17412197',
              '17327614',
              '17325295',
              '17238178',
              '17122470',
              '16979833',
              '16855268',
              '16702581',
              '16344418',
              '16043532',
              '15990006',
              '15798465',
              '15708241',
              '15667954',
              '15592263',
              '15465186',
              '15389481',
              '15297341',
              '15169816',
              '15093928',
              '15047076',
              '26867878',
              '26853347',
              '26853344',
              '26560679',
              '26553168',
              '26530753',
              '26452568',
              '26416998',
              '26384802',
              '26371137',
              '26279015',
              '26265174',
              '26247417',
              '26182304',
              '26181012',
              '26155944',
              '26104927',
              '25993151',
              '25892584',
              '25847940',
              '25835616',
              '25736383',
              '25733982',
              '25680609',
              '25654742',
              '25539366',
              '25539365',
              '25422489',
              '25320867',
              '25285553',
              '25195986',
              '25141939',
              '25135994',
              '25112462',
              '25104443',
              '25072452',
              '25052221',
              '25035195',
              '24941193',
              '24857095',
              '24849045',
              '24837886',
              '24777606',
              '24737273',
              '24693887',
              '24678385',
              '24667510',
              '24664468',
              '24663041',
              '24613813',
              '24586082',
              '24550418',
              '24521672',
              '24481682',
              '24468225',
              '24453295',
              '24449096',
              '24382062',
              '24362376',
              '24357525',
              '24210670',
              '24189127',
              '24161426',
              '24109522',
              '24084918',
              '24081942',
              '24072109',
              '23943890',
              '23845836',
              '23828146',
              '23765289',
              '23644481',
              '23642622',
              '23635559',
              '23542957',
              '23532233',
              '23523000',
              '23425992',
              '23425991',
              '23425990',
              '23069864',
              '23021709',
              '22913678',
              '22869890',
              '22692173',
              '22658441',
              '22436790',
              '22379423',
              '21952061',
              '21717446',
              '21644006',
              '21640329',
              '21525437',
              '21477932',
              '21445916',
              '21358962',
              '20832186',
              '20665106',
              '20421148',
              '20376554',
              '20351324',
              '19956081',
              '19847566',
              '19409733']})
In [81]:
dates
Out[81]:
[Timestamp('1990-01-01 00:00:00'),
 Timestamp('2000-01-01 00:00:00'),
 Timestamp('2010-01-01 00:00:00')]
In [82]:
counts
Out[82]:
defaultdict(list,
            {'Curtiland Deville': [0, 1, 35],
             'Emma Holliday': [0, 0, 25],
             'Reshma Jagsi': [0, 36, 104]})
In [83]:
keyword_counts
Out[83]:
defaultdict(list,
            {'Curtiland Deville': [0, 0, 1],
             'Emma Holliday': [0, 0, 4],
             'Reshma Jagsi': [0, 6, 18]})
In [85]:
# Tabulate per-author article counts (no keywords), one row per range start.
index = pd.Index(dates, name='dates')
search_counts = pd.DataFrame(counts, index=index)
search_counts
Out[85]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Curtiland Deville Emma Holliday Reshma Jagsi
dates
1990-01-01 0 0 0
2000-01-01 1 0 36
2010-01-01 35 25 104
In [93]:
def saveimage(df, image_path, title):
    """Plot df, set the plot title, and write the figure to image_path."""
    plot_axes = df.plot()
    plot_axes.set_title(title)
    plot_axes.get_figure().savefig(image_path)
In [99]:
# Save the no-keyword counts table and, when intervals are used, a plot.
search_count_path = join(target_folder, 'search_counts.csv')
search_counts.to_csv(search_count_path)
print("search_count_table_path = " + search_count_path)
if interval_in_years_int:
    title = 'Article Counts over time'
    image_path = join(target_folder, 'article_count.png')
    saveimage(search_counts, image_path, title)
    print('plot_image_path = ' + image_path)
search_count_table_path = /tmp/search_counts.csv
plot_image_path = /tmp/article_count.png
In [86]:
# Same table, restricted to the keyword queries.
keyword_search_counts = pd.DataFrame(keyword_counts, index=index)
keyword_search_counts
Out[86]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Curtiland Deville Emma Holliday Reshma Jagsi
dates
1990-01-01 0 0 0
2000-01-01 0 0 6
2010-01-01 1 4 18
In [94]:
# Save the keyword counts table and, when intervals are used, a plot.
keyword_search_count_path = join(
    target_folder, 'keyword_search_counts.csv')
keyword_search_counts.to_csv(keyword_search_count_path)
print("keyword_search_count_table_path = " + keyword_search_count_path)
if interval_in_years_int:
    title = 'Article Counts over time with Keywords'
    image_path = join(target_folder, 'keyword_article_count.png')
    saveimage(
        keyword_search_counts,
        image_path,
        title)
    print('keywords_plot_image_path = ' + image_path)
keyword_search_count_table_path = /tmp/keyword_search_counts.csv
keywords_plot_image_path = /tmp/keyword_article_count.png
In [100]:
# Count first-author articles per author from the accumulated ids.
cols = ['Author', 'No. first name articles']
first_name_articles = [
    (name, len(
        get_first_name_articles(name, author_articles[name])))
    for name in authors]
df = pd.DataFrame(first_name_articles, columns=cols)
df
Out[100]:
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Author No. first name articles
0 Curtiland Deville 7
1 Emma Holliday 3
2 Reshma Jagsi 67
In [101]:
# Save the first-author counts table.
first_name_path = join(
    target_folder, 'first_named_articles.csv')
df.to_csv(first_name_path, index=False)
print("first_name_articles_table_path = " + first_name_path)
first_name_articles_table_path = /tmp/first_named_articles.csv
In [108]:
log[:5]
Out[108]:
['(Curtiland Deville[Author]) AND ("1990/01/01"[Date - Publication] : "1999/12/31"[Date - Publication])\n0',
 '(Curtiland Deville[Author]) AND ("income"[Text Word] OR "poverty"[Text Word] OR "social class"[MeSH Terms] OR "socioeconomic factors"[MeSH Terms]) AND ("1990/01/01"[Date - Publication] : "1999/12/31"[Date - Publication])\n0',
 '(Emma Holliday[Author]) AND ("1990/01/01"[Date - Publication] : "1999/12/31"[Date - Publication])\n0',
 '(Emma Holliday[Author]) AND ("income"[Text Word] OR "poverty"[Text Word] OR "social class"[MeSH Terms] OR "socioeconomic factors"[MeSH Terms]) AND ("1990/01/01"[Date - Publication] : "1999/12/31"[Date - Publication])\n0',
 '(Reshma Jagsi[Author]) AND ("1990/01/01"[Date - Publication] : "1999/12/31"[Date - Publication])\n0']
In [103]:
# Write every query and its hit count to the log file, blank-line separated.
log_path = join(target_folder, 'log.txt')
with open(log_path, 'w') as f:
    f.write('\n\n'.join(log))
print('log_text_path = ' + log_path)
log_text_path = /tmp/log.txt