web-scraping with BeautifulSoup




Pay Notebook Creator: Salah Ahmed0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0

Using python to scrape data from pubmed

{query: query ? keyword to search for on pubmed}

In [3]:
# Crosscompute
query = 'hypnosis'
target_folder = '.'
In [4]:
import requests
from os.path import join
from bs4 import BeautifulSoup
from pandas import DataFrame
In [5]:
url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
In [6]:
# get html from a webpage, using key="term", value=query
r = requests.get(url, dict(term='query'))
# create BeautifulSoup object
soup = BeautifulSoup(r.content, 'lxml')
# get titles of articles from html, titles are the "text" in every class named "title"
In [9]:
tags = soup.findAll(class_='title')
tags[:5]
Out[9]:
[<p class="title"><a href="/pubmed/29206167" ref="ordinalpos=1&amp;ncbi_uid=29206167&amp;link_uid=29206167&amp;linksrc=docsum_title">scRNASeqDB: A Database for RNA-Seq Based Gene Expression Profiles in Human Single Cells.</a></p>,
 <p class="title"><a href="/pubmed/29202782" ref="ordinalpos=2&amp;ncbi_uid=29202782&amp;link_uid=29202782&amp;linksrc=docsum_title">Studying de-implementation in health: an analysis of funded research grants.</a></p>,
 <p class="title"><a href="/pubmed/29202689" ref="ordinalpos=3&amp;ncbi_uid=29202689&amp;link_uid=29202689&amp;linksrc=docsum_title">Explorative visual analytics on interval-based genomic data and their metadata.</a></p>,
 <p class="title"><a href="/pubmed/29199060" ref="ordinalpos=4&amp;ncbi_uid=29199060&amp;link_uid=29199060&amp;linksrc=docsum_title">Total Joint Arthroplasty in Patients With Parkinson's Disease: Survivorship, Outcomes, and Reasons for Failure.</a></p>,
 <p class="title"><a href="/pubmed/29191407" ref="ordinalpos=5&amp;ncbi_uid=29191407&amp;link_uid=29191407&amp;linksrc=docsum_title">Occupational outcomes following combat-related gunshot injury: Cohort study.</a></p>]
In [10]:
titles = [t.text for t in tags]
titles[:5]
Out[10]:
['scRNASeqDB: A Database for RNA-Seq Based Gene Expression Profiles in Human Single Cells.',
 'Studying de-implementation in health: an analysis of funded research grants.',
 'Explorative visual analytics on interval-based genomic data and their metadata.',
 "Total Joint Arthroplasty in Patients With Parkinson's Disease: Survivorship, Outcomes, and Reasons for Failure.",
 'Occupational outcomes following combat-related gunshot injury: Cohort study.']
In [11]:
# create dataframe object, set appropriate column name, export to csv file
df = DataFrame(titles, columns=['Title of Article'])
df
Out[11]:
<style> .dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; } </style>
Title of Article
0 scRNASeqDB: A Database for RNA-Seq Based Gene ...
1 Studying de-implementation in health: an analy...
2 Explorative visual analytics on interval-based...
3 Total Joint Arthroplasty in Patients With Park...
4 Occupational outcomes following combat-related...
5 Incidence and Risk Factors of Intracranial Hem...
6 Pediatric Supracondylar Humerus Fractures: AAO...
7 glactools: a command-line toolset for the mana...
8 SATORI: A System for Ontology-Guided Visual Ex...
9 Electronic Health Record Documentation of Nurs...
10 Consensus queries in ligand-based virtual scre...
11 Demographic and clinical factors associated wi...
12 Validation of a claims-based algorithm to char...
13 Potential role of Müller cells in the pathogen...
14 An efficient error correction algorithm using ...
15 [Progression of chronic renal disease in a ref...
16 The Belief and Attitude of the Drivers Toward ...
17 Programmed biomolecule delivery to enable and ...
18 Trends and Predictors of National Institutes o...
19 A cloud-based framework for large-scale tradit...
In [4]:
df.to_csv(path, index=False)
In [12]:
path = join(target_folder, 'titles-%s.csv' % query)
print('titles_table_path = %s' % path)
titles_table_path = ./titles-hypnosis.csv