1 """
2 Interface to ISI Web of Knowledge.
3
4 """
5
6 from twisted.web import client
7 from twisted.internet import defer
8 from twisted.python import failure
9
10 from Pyblio import Compat
11
12 import urllib, sys, logging
13
14 from gettext import gettext as _
15
16 from Pyblio.Exceptions import QueryError
17 from Pyblio.Parsers.Semantic.WOK import Reader
18 from Pyblio.External.HTTP import HTTPRetrieve
19 from Pyblio.External import IExternal
20
21
def _xml(data):
    """ Parse the result from the server, and immediately catch
    possible errors.

    `data` is the raw XML text of a server reply. The parsed tree is
    returned unless it contains an './error' element, in which case a
    QueryError carrying that element's text is raised instead.
    """
    root = Compat.ElementTree.XML(data)

    problem = root.find('./error')
    if problem is None:
        return root

    raise QueryError(problem.text)
33
def _r_info(tree):
    """ Return ([number of hits, number of searched records], session id).

    The two counters are read from './searchResults/recordsFound' and
    './searchResults/recordsSearched' and converted to integers; the
    session id is the text of './sessionID' (None when that element is
    absent).

    Raises QueryError when one of the counters is missing or is not a
    number, so that a malformed reply is reported like any other
    server-side problem instead of as a bare TypeError/ValueError.
    """
    stats = []

    for path in ('./searchResults/recordsFound',
                 './searchResults/recordsSearched'):
        text = tree.findtext(path)
        try:
            stats.append(int(text))
        except (TypeError, ValueError):
            # findtext() gives None for a missing element (TypeError in
            # int()), and arbitrary text for a corrupted one (ValueError).
            raise QueryError(
                'malformed answer from the server: bad or missing %s (%r)'
                % (path, text))

    return stats, tree.findtext('./sessionID')
42
43
class WOK(IExternal):
    """ I represent a query session on the Web of Knowledge.

    The session is connected to a database whose schema is
    'org.pybliographer/wok/...'.

    At most one request is in flight at a time: `_pending` holds the
    current HTTPRetrieve object (None when idle), and a new request is
    refused, by assertion, while it is set.
    """

    schema = 'org.pybliographer/wok/0.1'

    # Every request is sent to this URL, with its parameters appended
    # as a URL-encoded query string.
    baseURL = "http://estipub.isiknowledge.com/esti/cgi"

    # Largest number of records requested in a single round trip.
    MAX_PER_BATCH = 100

    log = logging.getLogger('pyblio.external.wok')

    def __init__(self, db):
        """ Create a session whose results are stored in `db`."""
        self.reader = Reader()
        self.db = db

        self._pending = None
        self._debug = False

    def _query(self, **args):
        """ Send a single request to the server.

        `args` are merged over the default request parameters and must
        at least contain 'query'. The HTTPRetrieve object is kept in
        `_pending` and its `deferred` attribute is returned.
        """
        assert not self._pending
        assert 'query' in args

        self._running = True

        data = {
            'databaseID': 'WOS',
            'rspType': 'xml',
            'method': 'searchRetrieve',
            'firstRec': '1',
            'numRecs': self.MAX_PER_BATCH,
            'depth': '',
            'editions': '',
            'fields': '',
        }
        data.update(args)

        self.log.debug('sending query %s' % repr(data))

        # Encode unicode values as UTF-8 before URL-encoding them.
        for key, value in list(data.items()):
            if isinstance(value, unicode):
                data[key] = value.encode('utf-8')

        url = self.baseURL + '?' + urllib.urlencode(data)

        self._pending = HTTPRetrieve(url, method='GET')
        return self._pending.deferred

    def _done(self, data):
        """ Called in any case to mark the end of a pending request to
        the WOK server.

        Meant to be attached with addBoth(): the result (or failure) is
        passed through untouched.
        """
        self._pending = None
        return data

    # NOTE(review): the `def` line of this method was missing from the
    # reviewed text; the name `count` is inferred from its docstring --
    # confirm against the IExternal interface.
    def count(self, query):
        """ Ask WOK for the number of results of a given query.

        A single record is requested and the session is closed right
        away (Logout='yes'). Returns a deferred that fires with the
        'recordsFound' value of the reply.
        """
        d = self._query(query=query, numRecs=1, Logout='yes')

        if self._debug:
            def show(data):
                # Dump the raw reply before it gets parsed.
                sys.stderr.write(data)
                return data
            d = d.addCallback(show)

        def process(tree):
            return _r_info(tree)[0][0]

        return d.addBoth(self._done).\
               addCallback(_xml).\
               addCallback(process)

    def search(self, query, maxhits=500):
        """ Start a query on the WOK, and fill in the database with
        the matches.

        Records are fetched in batches of at most MAX_PER_BATCH, until
        min(number of hits, maxhits) records are in the result set.

        @arg query: the query, in Web of Science format
        @type query: unicode string

        @arg maxhits: maximal number of records to retrieve (> 0)

        @return: a pair (deferred, result set). The deferred fires
            with the number of hits reported by the server once the
            query is finished, or with the failure that interrupted
            it. The result set is filled in as batches arrive.
        """
        assert not self._pending
        assert maxhits > 0

        # Position of the first record of the batch being requested,
        # and total number of records to fetch (known after the first
        # reply only).
        self._first = 1
        self._to_fetch = None

        data = {'query': query,
                'firstRec': self._first,
                'numRecs': min(self.MAX_PER_BATCH, maxhits)}

        # A single batch is enough: close the session with this very
        # request. `<=` matters, a batch of exactly MAX_PER_BATCH
        # records is still the last one.
        if maxhits <= self.MAX_PER_BATCH:
            data['Logout'] = 'yes'

        results = defer.Deferred()

        rs = self.db.rs.new()
        rs.name = _('Imported from Web of Knowledge')

        def failed(reason):
            # Any error in a batch aborts the whole search.
            results.errback(reason)

        def fetch():
            # Send the request described by `data` and route its
            # answer through the parser and back to received().
            d = self._query(**data).addBoth(self._done)

            d.addCallback(_xml).\
              addCallback(received).\
              addErrback(failed)

        def received(tree):
            stats, sessionID = _r_info(tree)
            found = stats[0]

            if self._to_fetch is None:
                # First batch: we now know how many records exist.
                self._to_fetch = min(found, maxhits)

            self.log.debug('session %s: received batch (%d pending)' % (
                repr(sessionID), self._to_fetch))

            self.reader.parse(tree.find('./records'), self.db, rs)

            parsed = len(rs)
            missing = self._to_fetch - parsed

            if missing <= 0:
                results.callback(found)
                return

            next_first = 1 + parsed
            if next_first <= self._first:
                # The last batch did not add anything to the result
                # set: asking again would request the very same
                # records forever, so stop with what we have.
                self.log.warning(
                    'session %s: no new record in the last batch, '
                    'stopping with %d of %d records',
                    repr(sessionID), parsed, self._to_fetch)
                results.callback(found)
                return

            self._first = next_first

            data['firstRec'] = next_first
            data['numRecs'] = min(self.MAX_PER_BATCH, missing)
            data['SID'] = sessionID

            # Close the session together with the last batch.
            if missing <= self.MAX_PER_BATCH:
                data['Logout'] = 'yes'

            fetch()

        fetch()

        return results, rs

    # NOTE(review): the `def` line of this method was missing from the
    # reviewed text; the name `cancel` is inferred from its docstring --
    # confirm against the IExternal interface.
    def cancel(self):
        """ Cancel a running query. The database is not reverted to its
        original state."""
        if not self._pending:
            return

        self._pending.cancel()
        self._pending = None
227