-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcitation_grabber.py
111 lines (93 loc) · 3.88 KB
/
citation_grabber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import requests
import requests_cache
requests_cache.install_cache('citations_cache')
from bs4 import BeautifulSoup
from Bio import Entrez
def get_citations(pmcid):
    """Scrape a PMC article page and return the PubMed IDs it cites.

    Requests ``http://www.ncbi.nlm.nih.gov/pmc/articles/<pmcid>`` and, for
    every ``<span class="nowrap ref pubmed">`` element on the page, takes
    the ``href`` of the span's anchor with the ``/pubmed/`` path removed.

    Args:
        pmcid: PMC identifier, interpolated into the article URL.

    Returns:
        A list with one string per reference link, in the order the spans
        are found. Spans that have no anchor, or whose anchor has no
        ``href``, are skipped.
    """
    articles_url = 'http://www.ncbi.nlm.nih.gov/pmc/articles/%s' % pmcid
    # The request is sent with a desktop-browser User-Agent string.
    headers = {
        'User-Agent': (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/39.0.2171.95 Safari/537.36"
        )
    }
    page = requests.get(articles_url, headers=headers)
    soup = BeautifulSoup(page.content, "lxml")

    citation_ids = []
    for span in soup.find_all("span", {"class": "nowrap ref pubmed"}):
        anchor = span.a
        # A reference span without a link would otherwise fail with an
        # uncaught TypeError (span.a is None) or KeyError (no href).
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            continue
        citation_ids.append(href.replace('/pubmed/', ''))
    return citation_ids
# If we query the database too heavily, NCBI will send a warning e-mail
# before cutting us off. Try to request records in batches (it is not
# certain whether batching actually helps).
# NOTE(review): the address below looks like a redacted placeholder --
# confirm a real contact e-mail is configured before querying NCBI.
Entrez.email = "[email protected]"
def get_pmc_id(pubmed_id):
    """Return the PMC identifier listed in a PubMed record.

    Downloads the record for ``pubmed_id`` as XML from the E-utilities
    ``efetch`` endpoint and returns the text of the first
    ``<articleid idtype="pmc">`` element.

    Args:
        pubmed_id: PubMed identifier, interpolated into the query string.

    Returns:
        The text content of the matching ``articleid`` element.

    Raises:
        AttributeError: if the record has no such element, because the
            lookup result is then ``None`` and ``.get_text()`` fails.
    """
    url_template = (
        'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
        'db=pubmed&id=%s&retmode=xml'
    )
    browser_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/39.0.2171.95 Safari/537.36"
    )
    response = requests.get(
        url_template % pubmed_id,
        headers={'User-Agent': browser_agent},
    )
    document = BeautifulSoup(response.content, "lxml")
    pmc_node = document.find("articleid", {"idtype": "pmc"})
    # No guard on purpose: a missing node surfaces as AttributeError here.
    return pmc_node.get_text()
def _mesh_terms(citation):
    """Return the MeSH descriptor names of one MedlineCitation entry.

    Descriptors whose ``MajorTopicYN`` attribute is ``'Y'`` get a trailing
    ``'*'``. An entry without ``MeshHeadingList`` yields an empty list.
    """
    headings = []
    # Make sure it has MeSH terms before trying to get them.
    if 'MeshHeadingList' in citation:
        for heading in citation['MeshHeadingList']:
            descriptor = heading['DescriptorName']
            term = descriptor
            # Mark main headings with an asterisk.
            if descriptor.attributes['MajorTopicYN'] == 'Y':
                term += '*'
            headings.append(term)
    return headings


def get_article_info(pubmed_ids):
    """Fetch PubMed records and reduce each one to a small dict.

    Args:
        pubmed_ids: value passed as ``id`` to ``Entrez.efetch`` for the
            ``pubmed`` database.

    Returns:
        A list of dicts, one per record that contains ``MedlineCitation``,
        with the keys:

        * ``'id'`` -- the record's ``PMID``
        * ``'title'`` -- the ``ArticleTitle``
        * ``'abstract'`` -- the ``AbstractText`` parts joined without a
          separator, or ``'n/a'`` when the article has no ``Abstract``
        * ``'terms'`` -- MeSH descriptor names, main headings suffixed
          with ``'*'``
    """
    handle = Entrez.efetch(db="pubmed", id=pubmed_ids, rettype="medline", retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()

    # Newer Biopython releases return a dict keyed by record kind rather
    # than a plain list; iterating that dict would only yield key strings.
    if isinstance(records, dict):
        records = records.get('PubmedArticle', [])

    articles = []
    for record in records:
        if 'MedlineCitation' not in record:
            continue
        citation = record['MedlineCitation']
        article = citation['Article']
        article_info = dict()
        article_info['id'] = citation['PMID']
        article_info['title'] = article['ArticleTitle']
        # The abstract arrives as a list of text parts.
        if 'Abstract' in article:
            article_info['abstract'] = ''.join(article['Abstract']['AbstractText'])
        else:
            article_info['abstract'] = 'n/a'
        article_info['terms'] = _mesh_terms(citation)
        articles.append(article_info)
    return articles
def write_to_file(f, original_id, citation_articles):
    """Write a citation listing as pipe-delimited lines.

    The first line is ``<original_id>|c|<number of articles>``. Each
    article then contributes four lines keyed by its own id: ``orig``
    (the original id), ``a`` (abstract), ``t`` (title) and ``m`` (its
    terms, each followed by ``|``).

    Args:
        f: object with a ``write`` method that receives the text.
        original_id: id of the article whose citations are listed.
        citation_articles: sequence of dicts with the keys ``'id'``,
            ``'abstract'``, ``'title'`` and ``'terms'``.
    """
    f.write('{}|c|{}\n'.format(original_id, len(citation_articles)))
    for entry in citation_articles:
        f.write('{}|orig|{}\n'.format(entry['id'], original_id))
        f.write('{}|a|{}\n'.format(entry['id'], entry['abstract']))
        f.write('{}|t|{}\n'.format(entry['id'], entry['title']))
        # Every term keeps its trailing '|', including the last one.
        mesh_field = ''.join('{}|'.format(term) for term in entry['terms'])
        f.write('{}|m|{}\n'.format(entry['id'], mesh_field))
def write_citations(f, pubmed_id):
    """Look up the citations of one PubMed article and write them to ``f``.

    Chains ``get_pmc_id`` -> ``get_citations`` -> ``get_article_info`` and
    hands the result to ``write_to_file``.

    Args:
        f: object with a ``write`` method, forwarded to ``write_to_file``.
        pubmed_id: PubMed identifier of the article to look up.

    Raises:
        RuntimeError: re-raised unless its message is exactly
            ``'Supplied id parameter is empty.'``.
    """
    try:
        pmc_id = get_pmc_id(pubmed_id)
        cited_ids = get_citations(pmc_id)
        write_to_file(f, pubmed_id, get_article_info(cited_ids))
    except AttributeError as lookup_error:
        # Reported and swallowed; presumably this is the "no PMC id" case
        # from get_pmc_id -- verify against that function.
        print(pubmed_id, lookup_error)
    except RuntimeError as fetch_error:
        message = str(fetch_error)
        print(message)
        # Only the "empty id" failure is tolerated; anything else propagates.
        if message != 'Supplied id parameter is empty.':
            raise fetch_error
if __name__ == '__main__':
    # Script entry point: write the citation listing for one hard-coded
    # PubMed id to standard output.
    import sys

    write_citations(f=sys.stdout, pubmed_id=15064396)