In [3]:

# Crosscompute
query = 'hypnosis'
target_folder = '.'

In [4]:

import requests
from os.path import join
from bs4 import BeautifulSoup
from pandas import DataFrame

In [5]:

url = 'https://www.ncbi.nlm.nih.gov/pubmed/'

In [6]:

# get html from a webpage, using key="term", value=query
r = requests.get(url, dict(term='query'))
# create BeautifulSoup object
soup = BeautifulSoup(r.content, 'lxml')
# get titles of articles from html, titles are the "text" in every class named "title"

In [9]:

tags = soup.findAll(class_='title')
tags[:5]

Out[9]:

[<p class="title"><a href="/pubmed/29206167" ref="ordinalpos=1&amp;ncbi_uid=29206167&amp;link_uid=29206167&amp;linksrc=docsum_title">scRNASeqDB: A Database for RNA-Seq Based Gene Expression Profiles in Human Single Cells.</a></p>,
 <p class="title"><a href="/pubmed/29202782" ref="ordinalpos=2&amp;ncbi_uid=29202782&amp;link_uid=29202782&amp;linksrc=docsum_title">Studying de-implementation in health: an analysis of funded research grants.</a></p>,
 <p class="title"><a href="/pubmed/29202689" ref="ordinalpos=3&amp;ncbi_uid=29202689&amp;link_uid=29202689&amp;linksrc=docsum_title">Explorative visual analytics on interval-based genomic data and their metadata.</a></p>,
 <p class="title"><a href="/pubmed/29199060" ref="ordinalpos=4&amp;ncbi_uid=29199060&amp;link_uid=29199060&amp;linksrc=docsum_title">Total Joint Arthroplasty in Patients With Parkinson's Disease: Survivorship, Outcomes, and Reasons for Failure.</a></p>,
 <p class="title"><a href="/pubmed/29191407" ref="ordinalpos=5&amp;ncbi_uid=29191407&amp;link_uid=29191407&amp;linksrc=docsum_title">Occupational outcomes following combat-related gunshot injury: Cohort study.</a></p>]

In [10]:

titles = [t.text for t in tags]
titles[:5]

Out[10]:

['scRNASeqDB: A Database for RNA-Seq Based Gene Expression Profiles in Human Single Cells.',
 'Studying de-implementation in health: an analysis of funded research grants.',
 'Explorative visual analytics on interval-based genomic data and their metadata.',
 "Total Joint Arthroplasty in Patients With Parkinson's Disease: Survivorship, Outcomes, and Reasons for Failure.",
 'Occupational outcomes following combat-related gunshot injury: Cohort study.']

In [11]:

# create dataframe object, set appropriate column name, export to csv file
df = DataFrame(titles, columns=['Title of Article'])
df

Out[11]:

	Title of Article
0	scRNASeqDB: A Database for RNA-Seq Based Gene ...
1	Studying de-implementation in health: an analy...
2	Explorative visual analytics on interval-based...
3	Total Joint Arthroplasty in Patients With Park...
4	Occupational outcomes following combat-related...
5	Incidence and Risk Factors of Intracranial Hem...
6	Pediatric Supracondylar Humerus Fractures: AAO...
7	glactools: a command-line toolset for the mana...
8	SATORI: A System for Ontology-Guided Visual Ex...
9	Electronic Health Record Documentation of Nurs...
10	Consensus queries in ligand-based virtual scre...
11	Demographic and clinical factors associated wi...
12	Validation of a claims-based algorithm to char...
13	Potential role of Müller cells in the pathogen...
14	An efficient error correction algorithm using ...
15	[Progression of chronic renal disease in a ref...
16	The Belief and Attitude of the Drivers Toward ...
17	Programmed biomolecule delivery to enable and ...
18	Trends and Predictors of National Institutes o...
19	A cloud-based framework for large-scale tradit...

In [4]:

df.to_csv(path, index=False)

In [12]:

path = join(target_folder, 'titles-%s.csv' % query)
print('titles_table_path = %s' % path)

titles_table_path = ./titles-hypnosis.csv

web-scraping with BeautifulSoup

Using python to scrape data from pubmed¶

Pay Notebook Creator: Salah Ahmed	0
Set Container: Numerical CPU with TINY Memory for 10 Minutes	0
Total	0