1 """
2 An asynchronous query module to get DOI numbers given publication information.
3
This module connects to http://crossref.org/ and tries to resolve DOI
numbers given fuzzy publication information such as journal title,
volume, year and start page.
7 """
8
9 from twisted.web import client
10 from twisted.internet import defer
11
12 import random
13 import urllib
14
15 import logging
16
17 from Pyblio import Store, Attribute
18
19
class DOIQuery(object):
    """Query DOI numbers.

    Convenience class that groups queries to CrossRef in batches in
    order to increase throughput.

    >>> cnx = DOIQuery(db, user=..., pwd=...)
    >>> for record in to_resolve:
    ...     cnx.search(record).addCallback(got_results)
    >>> cnx.finished()

    The 'db' parameter is a database from which the queries and
    results will be composed. It must conform to the
    I{org.pybliographer/crossref/0.1} schema.

    The actual queries take place when enough searches have been
    requested (see BATCH), or when the .finished() method is called.

    For each query, a list of possible DOIs is returned. It can
    possibly be empty if the citation could not be resolved.

    In case of a failure in the query protocol itself, the registered
    errback handlers are called for each query of the failed batch.

    The deferred returned by .finished() fires with a two-item list
    [resolved, failed] counting the queries whose batch succeeded and
    the queries whose batch failed.
    """

    # Number of queued searches that triggers the emission of a batch.
    BATCH = 30

    baseURL = 'http://doi.crossref.org/servlet/query'

    log = logging.getLogger('pyblio.external.crossref')

    # Order of the citation fields in a piped journal query/answer; the
    # answer carries three more columns: resource type, key and DOI.
    _ARTICLE_FIELDS = ('issn', 'title', 'author', 'volume', 'issue',
                       'startpage', 'year')

    # Same thing for the 12-column book layout.
    _BOOK_FIELDS = ('isbn', 'serial', 'title', 'author', 'volume',
                    'edition', 'startpage', 'year', 'part')

    def __init__(self, db, user, pwd):
        """Remember the database and the credentials sent with each batch."""
        self.db = db
        self.user = user
        self.pwd = pwd

        # uid -> Deferred of every search that has not been answered yet
        self._pending = {}
        # uid given to the next search
        self._uid = 0
        # (uid, query) pairs not yet grouped in a batch
        self._queue = []
        # batches waiting to be posted, oldest first
        self._batch = []
        # True while a request is in flight
        self._running = False
        # Deferred handed out by finished(), None before that call
        self._finished = None
        # [queries of successful batches, queries of failed batches]
        self._stats = [0, 0]

    def _make_batch(self):
        """Turn the queued searches into a batch and start sending if idle.

        Nothing happens when the queue is empty, so that no empty
        request is ever posted to the server.
        """
        if not self._queue:
            return

        enqueued, self._queue = self._queue, []
        self._batch.append(enqueued)

        if not self._running:
            self._running = True
            self._send()

    def _send(self):
        """Post the oldest waiting batch, or become idle if none is left."""
        if not self._batch:
            self._running = False
            return

        # Oldest batch first, so that early searches are not starved.
        enqueued = self._batch.pop(0)

        qdata = '\n'.join([q for _, q in enqueued]).encode('utf-8')

        self.log.debug('sending a batch to the server')
        self.log.debug(repr(qdata))

        form = {
            'usr': self.user,
            'pwd': self.pwd,
            'qdata': qdata,
        }

        req = client.getPage(
            self.baseURL, method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            postdata=urllib.urlencode(form))

        def received(payload):
            self.log.debug('received a batch from the server')
            self.log.debug(repr(payload))

            # Parse everything before firing any callback: a protocol
            # error raised here sends the whole batch to failed().
            results = self._parse_response(payload)

            for uid, _ in enqueued:
                # The entry is removed only after its callback ran, so
                # that the search still counts as pending meanwhile.
                self._pending[uid].callback(results.get(uid, []))
                del self._pending[uid]

            self._stats[0] += len(enqueued)
            self._batch_done()

        def failed(reason):
            self.log.debug('too bad, the batch failed: %s' % str(reason))

            notified = 0
            for uid, _ in enqueued:
                d = self._pending.get(uid)
                if d is None:
                    # already answered before the error occurred
                    continue
                d.errback(reason)
                del self._pending[uid]
                notified += 1

            self._stats[1] += notified
            self._batch_done()

        # Chained on purpose: errors raised by received() reach failed().
        req.addCallback(received).addErrback(failed)

    def _parse_response(self, data):
        """Decode a server answer into a dict uid -> list of records.

        Lines whose last two columns are not a numeric key followed by
        a DOI column are skipped, as are lines with an empty DOI.

        Raises ValueError when a line refers to a key that is not
        pending, or does not have 10 or 12 columns.
        """
        results = {}

        for line in data.decode('latin-1').split('\n'):
            line = line.strip()
            if not line:
                continue

            parts = line.split('|')
            try:
                key, doi = parts[-2:]
                key = int(key)
            except (IndexError, ValueError):
                continue
            doi = doi.strip()

            if key not in self._pending:
                raise ValueError('key %s received while not expected' % repr(key))

            if len(parts) not in (10, 12):
                raise ValueError('result %s has not the expected syntax' % repr(line))

            if not doi:
                self.log.debug('no DOI for key %s (%s)' % (repr(key), repr(line)))
                continue

            results.setdefault(key, []).append(self._make_record(parts, doi))

        return results

    @classmethod
    def _named_parts(cls, parts):
        """Return (doctype name, [(field, value), ...]) for an answer line.

        10 columns describe an article, 12 columns a book; any other
        width raises ValueError.
        """
        if len(parts) == 10:
            return 'article', list(zip(cls._ARTICLE_FIELDS, parts))
        if len(parts) == 12:
            return 'book', list(zip(cls._BOOK_FIELDS, parts))
        raise ValueError('result %s has not the expected syntax'
                         % repr('|'.join(parts)))

    def _make_record(self, parts, doi):
        """Build the result record for one answer line carrying a DOI."""
        def person(val):
            return Attribute.Person(last=val)

        def year(val):
            return Attribute.Date(year=int(val))

        doctype, fields = self._named_parts(parts)
        tp = self.db.schema.txo['doctype'].byname

        rec = Store.Record()
        rec.add('doi', doi, Attribute.ID)
        rec.add('doctype', tp(doctype), Attribute.Txo)

        for field, value in fields:
            if not value:
                # empty columns are simply left out of the record
                continue

            if field == 'author':
                rec.add(field, value, person)
            elif field == 'year':
                try:
                    int(value)
                except ValueError:
                    # one odd year must not fail the whole batch
                    self.log.debug('ignoring non-numeric year %s' % repr(value))
                    continue
                rec.add(field, value, year)
            else:
                rec.add(field, value, Attribute.Text)

        return rec

    def _maybe_finish(self):
        """Fire the finished() deferred, once, when nothing is pending."""
        d = self._finished
        if d is not None and not self._pending and not d.called:
            d.callback(self._stats)

    def _batch_done(self):
        """Called after each batch: report completion, then go on sending."""
        self._maybe_finish()
        self._send()

    def _prepare(self, q):
        """Queue the piped query q and return the Deferred of its results.

        The query must carry the current uid as its key (see search()).
        A batch is emitted as soon as BATCH queries are queued.
        """
        d = defer.Deferred()

        uid = self._uid
        self._uid += 1

        self._pending[uid] = d
        self._queue.append((uid, q))

        if len(self._queue) >= self.BATCH:
            self._make_batch()

        return d

    def finished(self):
        """Flush the queued searches and return a completion Deferred.

        The Deferred fires with the [resolved, failed] statistics once
        every search has been answered, immediately if none is pending.
        No further search is accepted after this call.

        Raises AssertionError when called twice.
        """
        if self._finished is not None:
            raise AssertionError('finished() called twice')

        self._make_batch()

        self._finished = defer.Deferred()
        self._maybe_finish()
        return self._finished

    @staticmethod
    def _clean(value):
        """Return value without the characters that delimit piped queries."""
        if not value:
            return ''
        for ch in '|\r\n':
            value = value.replace(ch, ' ')
        return value

    def _format_query(self, record, uid):
        """Return the piped query describing record, keyed by uid.

        Only the first value of each field is used; missing fields are
        sent empty. Raises ValueError if the record's doctype is
        neither 'article' nor 'book'.
        """
        t = record['doctype'][0]
        doctype = self.db.schema.txo[t.group][t.id].names['C']

        if doctype == 'article':
            layout = self._ARTICLE_FIELDS
        elif doctype == 'book':
            layout = self._BOOK_FIELDS
        else:
            raise ValueError('cannot search for doctype %s' % repr(doctype))

        values = []
        for field in layout:
            try:
                if field == 'year':
                    value = str(record[field][0].year)
                elif field == 'author':
                    value = record[field][0].last
                else:
                    value = record.get(field, [''])[0]
            except (KeyError, IndexError):
                value = ''
            values.append(self._clean(value))

        return '|'.join(values + ['full_text', str(uid), ''])

    def search(self, record):
        """Request the DOIs matching record, an 'article' or a 'book'.

        Returns a Deferred that fires with the list of matching
        records, possibly empty.

        Raises AssertionError once finished() has been called, and
        ValueError for any other document type.
        """
        if self._finished is not None:
            raise AssertionError('finished() already called')

        return self._prepare(self._format_query(record, self._uid))

    # journalSearch() is the spelling used by earlier usage examples of
    # this class; both names reach the same method.
    journalSearch = search
291