Package Pyblio :: Package External :: Module PubMed
[hide private]
[frames] | [no frames]

Source Code for Module Pyblio.External.PubMed

  1  # This file is part of pybliographer 
  2  #  
  3  # Copyright (C) 1998-2006 Frederic GOBRY 
  4  # Email : gobry@pybliographer.org 
  5  #           
  6  # This program is free software; you can redistribute it and/or 
  7  # modify it under the terms of the GNU General Public License 
  8  # as published by the Free Software Foundation; either version 2  
  9  # of the License, or (at your option) any later version. 
 10  #    
 11  # This program is distributed in the hope that it will be useful, 
 12  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 14  # GNU General Public License for more details.  
 15  #  
 16  # You should have received a copy of the GNU General Public License 
 17  # along with this program; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 19  #  
 20   
 21  """ 
 22  Programmatic access to the PubMed database 
 23  """ 
 24   
 25  # Documentation taken from: 
 26  #   http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html 
 27  # 
 28   
 29  import logging, urllib 
 30  import datetime 
 31   
 32  from gettext import gettext as _ 
 33   
 34  from Pyblio import Compat 
 35   
 36  from twisted.web import client 
 37  from twisted.internet import defer, reactor 
 38   
 39  from Pyblio.Exceptions import QueryError 
 40  from Pyblio.External.HTTP import HTTPRetrieve 
 41  from Pyblio.Parsers.Semantic.PubMed import Reader 
 42   
 43   
def _xml(data):
    """Parse the raw XML reply received from the server.

    Raises QueryError as soon as the reply carries an <ERROR>
    element; otherwise returns the parsed element tree.
    """
    tree = Compat.ElementTree.XML(data)

    error = tree.find('./ERROR')
    if error is None:
        return tree

    raise QueryError(error.text)
class QueryHelper(object):
    """Helper data and methods used to compose advanced PubMed queries.

    The class-level dictionaries map raw PubMed query values (field
    codes, publication types, filters, ...) to human-readable,
    translatable labels, and are meant to populate an advanced search
    user interface.  makeQuery() turns a set of selected values into
    the actual query string sent to the server.
    """

    # PubMed search field codes -> displayed label.
    query_fields = {
        'ALL': _('All Fields'),
        'AD': _('Affiliation'),
        'AU': _('Author Name'),
        'RN': _('EC/RN Number'),
        'EDAT': _('Entrez Date'),
        'FILTER': _('Filter'),
        'IP': _('Issue'),
        'TA': _('Journal Title'),
        'LA': _('Language'),
        'MHDA': _('MeSH Date'),
        'MAJR': _('MeSH Major Topic'),
        'SH': _('MeSH Subheading'),
        'MH': _('MeSH Terms'),
        'PG': _('Pagination'),
        'DP': _('Publication Date'),
        'PT': _('Publication Type'),
        'SI': _('Secondary Source ID'),
        'NM': _('Substance Name'),
        'TW': _('Text Word'),
        'TI': _('Title'),
        'TIAB': _('Title/Abstract'),
        'PMID': _('UID'),
        'VI': _('Volume'),
    }

    # Values usable with the [pt] (publication type) qualifier.
    publication_types = {
        'addresses': _('Addresses'),
        'bibliography': _('Bibliography'),
        'biography': _('Biography'),
        'classical article': _('Classical Article'),
        'clinical conference': _('Clinical Conference'),
        'clinical trial': _('Clinical Trial'),
        'clinical trial, phase I': _('Clinical Trial, Phase I'),
        'clinical trial, phase II': _('Clinical Trial, Phase II'),
        'clinical trial, phase III': _('Clinical Trial, Phase III'),
        'clinical trial, phase IV': _('Clinical Trial, Phase IV'),
        'comment': _('Comment'),
        'congresses': _('Congresses'),
        'consensus development conference': _('Consensus Development Conference'),
        'consensus development conference, NIH': _('Consensus Development Conference, NIH'),
        'controlled clinical trial': _('Controlled Clinical Trial'),
        'corrected and republished article': _('Corrected and Republished Article'),
        'dictionary': _('Dictionary'),
        'directory': _('Directory'),
        'duplicate publication': _('Duplicate Publication'),
        'editorial': _('Editorial'),
        'evaluation studies': _('Evaluation Studies'),
        'festschrift': _('Festschrift'),
        'government publications': _('Government Publications'),
        'guideline': _('Guideline'),
        'historical article': _('Historical Article'),
        'interview': _('Interview'),
        'journal article': _('Journal Article'),
        'lectures': _('Lectures'),
        'legal cases': _('Legal Cases'),
        'legislation': _('Legislation'),
        'letter': _('Letter'),
        'meta-analysis': _('Meta-Analysis'),
        'multicenter study': _('Multicenter Study'),
        'news': _('News'),
        'newspaper article': _('Newspaper Article'),
        'overall': _('Overall'),
        'periodical index': _('Periodical Index'),
        'practice guideline': _('Practice Guideline'),
        'randomized controlled trial': _('Randomized Controlled Trial'),
        'retraction of publication': _('Retraction of Publication'),
        'retracted publication': _('Retracted Publication'),
        'review': _('Review'),
        'review, academic': _('Review, Academic'),
        'review, literature': _('Review Literature'),
        'review, multicase': _('Review, Multicase'),
        'review of reported cases': _('Review of Reported Cases'),
        'review, tutorial': _('Review, Tutorial'),
        'scientific integrity review': _('Scientific Integrity Review'),
        'technical report': _('Technical Report'),
        'twin study': _('Twin Study'),
        'validation studies': _('Validation Studies'),
    }

    # Values usable with the [la] (language) qualifier.
    language = {
        'english': _('English'),
        'french': _('French'),
        'german': _('German'),
        'italian': _('Italian'),
        'japanese': _('Japanese'),
        'russian': _('Russian'),
        'spanish': _('Spanish'),
    }

    # Age range filters, as (MeSH term, label) pairs.  This is a list
    # and not a dict because several labels share the same MeSH term.
    age_range = [
        ('infant', _('All Infant (birth-23 month)')),
        ('child', _('All Child (0-18 years)')),
        ('adult', _('All Adult (19+ years)')),
        ('infant, newborn', _('Newborn (birth-1 month)')),
        ('infant', _('Infant (1-23 months)')),
        ('child, preschool', _('Preschool Child (2-5 years)')),
        ('child', _('Child (6-12 years)')),
        ('adolescence', _('Adolescent (13-18 years)')),
        ('adult', _('Adult (19-44 years)')),
        ('middle age', _('Middle Aged (45-64 years)')),
        ('aged', _('Aged (65+ years)')),
        ('aged, 80 and over', _('80 and over')),
    ]

    human_animal = {
        'human': _('Human'),
        'animal': _('Animal'),
    }

    gender = {
        'female': _('Female'),
        'male': _('Male'),
    }

    # Journal/database subset filters.  The 'jsubset?' values are
    # used verbatim in the query, the others carry their own
    # qualifier ([ab], [sb], ...).
    subset = {
        'bioethics[ab]': _('Bioethics'),

        'jsubsetaim': _('Core clinical journals'), #AIM - Abridged Index Medicus, a list of core clinical journals
        'jsubsetb': _('Biotechnology journals'), #B - biotechnology journals (assigned 1990 - 1998), non-Index Medicus
        # BUG FIX: this key was misspelled 'jusbsetc', which is not a
        # valid PubMed subset value.
        'jsubsetc': _('Communication disorders journals'), #C - communication disorders journals (assigned 1977 - 1997), non-Index Medicus
        'jsubsetd': _('Dental journals'), #D - dentistry journals
        'jsubsete': _('Bioethics journals'), #E - bioethics journals, non-Index Medicus
        'jsubseth': _('Health administration journals'), #H - health administration journals, non-Index Medicus
        'jsubsetim': _('Index Medicus journals'), #IM - Index Medicus journals
        'jsubsetk': _('Consumer health journals'), #K - consumer health journals, non-Index Medicus
        'jsubsetn': _('Nursing journals'), #N - nursing journals
        'jsubsetq': _('History of Medicine journals'), #Q - history of medicine journals, non-Index Medicus
        'jsubsetr': _('Reproduction journals'), #R - reproduction journals (assigned 1972 - 1979), non-Index Medicus
        'jsubsets': _('NASA journals'), #S - National Aeronautics and Space Administration (NASA) journals, non-Index Medicus
        'jsubsett': _('Health tech assesment journals'), #T - health technology assessment journals, non-Index Medicus
        'jsubsetx': _('AIDS/HIV journals'), #X - AIDS/HIV journals, non-Index Medicus

        'aids[sb]': _('AIDS'),
        'cam[sb]': _('Complementary and Alternative Medicine'),
        'history[sb]': _('History of Medicine'),
        'in process[sb]': _('In process'),
        'medline[sb]': _('MEDLINE'),
        'medline pmc[sb]': _('PubMed Central'),
        'space[sb]': _('Space Life Sciences'),
        'publisher[sb]': _('Supplied by Publisher'),
        'tox[sb]': _('Toxicology'),
    }

    def makeQuery(self, field='ALL', keyword=None, abstract=False,
                  epubahead=False, publication_type=None,
                  language=None, subset=None, age_range=None,
                  human_animal=None, gender=None,
                  use_publication_date=False, from_date=None,
                  to_date=None):
        """Compose an advanced query string.

        Args:
          field: single key from self.query_fields
          keyword: string searched in 'field', or None
          abstract: bool, restrict to entries carrying an abstract
          epubahead: bool, restrict to e-publications ahead of print
          publication_type: single key from self.publication_types, or None
          language: key from self.language, or None
          subset: key from self.subset, or None
          age_range: first element of a self.age_range pair, or None
          human_animal: key from self.human_animal, or None
          gender: key from self.gender, or None
          use_publication_date: bool; when True the date interval
            selects on the publication date, otherwise on the Entrez
            date
          from_date: datetime.date or None
          to_date: datetime.date or None; defaults to today when
            from_date is given

        Returns:
          the selected criteria joined with ' AND ', as a string.
        """

        parts = []
        if keyword is not None:
            parts.append(keyword + '[%s]' % field)
        if abstract:
            parts.append('hasabstract')
        if epubahead:
            parts.append('pubstatusaheadofprint')
        if publication_type:
            # BUG FIX: this previously read 'pubtype', an undefined
            # name, and raised a NameError whenever a publication
            # type was selected.
            parts.append(publication_type + '[pt]')
        if language:
            parts.append(language + '[la]')
        if subset:
            # subset values already carry their qualifier.
            parts.append(subset)
        if age_range:
            parts.append(age_range + '[mh]')
        if human_animal:
            parts.append(human_animal + '[mh]')
        if gender:
            parts.append(gender + '[mh]')

        if from_date:
            if not to_date:
                to_date = datetime.date.today()
            date = ':'.join([from_date.strftime('%Y/%m/%d'),
                             to_date.strftime('%Y/%m/%d')])

            if use_publication_date:
                date += '[dp]'
            else:
                date += '[edat]'
            parts.append(date)

        return ' AND '.join(parts)
272
class PubMed(object):
    """ A connection to the PubMed database """

    # Identifier of the record schema handed to the semantic Reader.
    schema = 'org.pybliographer/pubmed/0.1'

    # Entry point of the NCBI Entrez e-utilities services.
    baseURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils'

    # Maximum number of records requested per efetch round trip.
    BATCH_SIZE = 500

    # Identification sent along with every request (see _query).
    toolName = 'pybliographer'
    adminEmail = 'webmaster@pybliographer.org'

    log = logging.getLogger('pyblio.external.pubmed')

    # Service paths, relative to baseURL.
    SRV_SEARCH = '/esearch.fcgi'
    SRV_FETCH = '/efetch.fcgi'
290 - def __init__(self, db):
291 292 self.db = db 293 self._pending = None 294 self.reader = Reader() 295 296 return
297
298 - def _query(self, service, args, **kargs):
299 300 all = {'email': self.adminEmail, 301 'tool': self.toolName, 302 'retmode': 'xml'} 303 304 all.update(args) 305 all.update(kargs) 306 307 # ensure all arguments are utf8 encoded 308 for k, v in all.items(): 309 if isinstance(v, unicode): 310 all[k] = v.encode('utf-8') 311 312 url = self.baseURL + service + '?' + urllib.urlencode(all) 313 314 self.log.debug('sending query %r' % url) 315 316 # We have the charge of setting and cleaning self._pending 317 self._pending = HTTPRetrieve(url) 318 319 def done(data): 320 self._pending = None 321 return data
322 323 return self._pending.deferred.addBoth(done)
324 325
326 - def count(self, query, db='PubMed'):
327 328 assert self._pending is None, 'no more than one search at a time per connection' 329 330 data = {'db': db, 331 'term': query} 332 333 req = self._query(self.SRV_SEARCH, data, rettype='count') 334 335 def success(data): 336 return int(data.find('./Count').text)
337 338 return req.addCallback(_xml).addCallback(success) 339 340
    def search(self, query, maxhits=500, db='PubMed'):
        """Run *query* against *db* and import up to *maxhits* records.

        Returns a (deferred, result set) pair: records are parsed
        into self.db and added to the result set as the batches
        arrive, and the deferred finally fires with the total number
        of results available on the server (which may exceed what was
        retrieved).
        """

        assert self._pending is None, 'no more than one search at a time per connection'

        query = query.strip()

        data = {'db': db,
                'term': query}

        # usehistory=y makes the server keep the match list, so that
        # efetch can page through it with WebEnv / query_key below.
        req = self._query(self.SRV_SEARCH, data, usehistory='y')

        # The deferred for the global result
        results = defer.Deferred()

        # The result set that will contain the data
        rs = self.db.rs.new()
        rs.name = _('Imported from PubMed')

        # Special case for no query: this would cause an error from
        # the server if we do not catch it first.
        if not query:
            def autofire():
                results.callback(0)
            # Fire asynchronously, so the caller can attach callbacks
            # to `results` before it triggers.
            reactor.callLater(0, autofire)
            return results, rs

        # Shared mutable state between the nested callbacks: only
        # holds 'missing', the number of records still to retrieve.
        stats = {}

        def failed(reason):
            results.errback(reason)

        def got_summary(data):
            # Total number of results
            all_results = int(data.find('./Count').text)

            # Parameters necessary to fetch the content of the result set
            fetchdata = {
                'db': db,
                'WebEnv': data.find('./WebEnv').text,
                'query_key': data.find('./QueryKey').text,
                }

            stats['missing'] = min(all_results, maxhits)

            self.log.info('%d results, retrieving %d' % (
                all_results, stats['missing']))

            def fetch(data):
                # data is None during the initial call to the method,
                # so that we can reuse the same code.
                if data is not None:
                    # Process the incoming XML data
                    previously = len(rs)
                    self.reader.parse(data, self.db, rs)
                    freshly_parsed = len(rs) - previously
                    if freshly_parsed <= 0:
                        self.log.warn("what happend? I increased the result set by %d" % freshly_parsed)
                        # pretend there has been at least one parsing, so
                        # that we ensure that the task
                        # progresses. Otherwise we might loop forever on
                        # an entry we cannot parse.
                        freshly_parsed = 1

                    stats['missing'] -= freshly_parsed

                if stats['missing'] <= 0:
                    self.log.info('finished')
                    results.callback(all_results)
                    return

                # No need to fetch 500 results if only 20 are requested
                batch = min(self.BATCH_SIZE, stats['missing'])
                self.log.info('retrieving next %d' % batch)

                # retstart=len(rs): resume right after the records
                # already parsed into the result set.
                d = self._query(self.SRV_FETCH, fetchdata,
                                retstart=len(rs), retmax=batch)

                # Each batch re-enters fetch() until nothing is missing.
                d.addCallback(_xml).\
                    addCallback(fetch).\
                    addErrback(failed)
                return

            # Bootstrap the fetching process
            fetch(None)

        req.addCallback(_xml).\
            addCallback(got_summary).\
            addErrback(failed)

        return results, rs
433 - def cancel(self):
434 """ Cancel a running query. The database is not reverted to its 435 original state.""" 436 if not self._pending: 437 return 438 439 self._pending.cancel() 440 return
441