21 """
22 Citeseer (http://citeseer.ist.psu.edu/) queries
23 """
24
25
26
27
28
29
30
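# Typical usage (a sketch: assumes a Pyblio database `db` and a running
# Twisted reactor; names outside this module are illustrative):
#
#   citeseer = Citeseer(db)
#   deferred, rs = citeseer.search(u'information retrieval', maxhits=20)
#   deferred.addCallback(lambda total: log.info('%d hits' % total))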
import urllib
import logging
import BeautifulSoup
import re
import StringIO

from gettext import gettext as _
from twisted.internet import defer, reactor

from Pyblio import Attribute
from Pyblio.External import IExternal
from Pyblio.External.HTTP import HTTPRetrieve
from Pyblio.Exceptions import QueryError, ParserError
from Pyblio.Parsers.Semantic import BibTeX

log = logging.getLogger('pyblio.external.citeseer')

whitespace = re.compile(r'[\s\n]+', re.M)
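# The scrapers below depend on the layout of Citeseer's HTML, roughly
# (an illustrative fragment, not captured from a live page):
#
#   523 documents found. ...
#   <!--RLS-->
#   <!--RIS--> <a href="http://citeseer.../rec1.html">...</a>
#   <!--RIS--> <a href="http://citeseer.../rec2.html">...</a>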
51 """Parse a Citeseer result page containing links to the actual
52 detailed citations."""
53
54 results = re.compile(r'(\d+|No)\s+documents?\s+found')
56 self.soup = BeautifulSoup.BeautifulSoup(page)
57 self.rls = self.soup.findAll(
58 text=lambda text: isinstance(text, BeautifulSoup.Comment) and \
59 text == 'RLS')[0]
60 self.ris = self.soup.findAll(
61 text=lambda text: isinstance(text, BeautifulSoup.Comment) and \
62 text == 'RIS')
63
65 """Return the overall result count."""
66
67
68
69 if not self.ris:
70 return 0
71 current = self.rls.previous
72 while current is not None:
73 if current.string is not None:
74 m = self.results.search(current.string)
75 if m:
76 return int(m.group(1))
77 current = current.previous
78 raise QueryError(_("cannot parse result page"))
79
81 """Return the result links."""
82 return [str(ris.findNext('a')['href']) for ris in self.ris]
83
84
92
94 """Parse a detailed citation page, containing an abstract and a
95 BibTeX snippet."""
96
98 self.soup = BeautifulSoup.BeautifulSoup(page)
99
    def citation(self):
        """Return a dict with the entry's BibTeX snippet and, when one
        is present, its abstract."""
        # The BibTeX entry is the first <pre> block of the page.
        content = {'bibtex': self.soup.pre.string}
        abstract = self.soup.findAll(text='Abstract:')
        if abstract:
            abstract = abstract[0].parent.nextSibling.strip()
            content['abstract'] = whitespace.sub(' ', abstract)
        return content
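# CitationScraper.citation() yields a dictionary shaped like (an
# invented example):
#
#   {'bibtex': u'@misc{key, title = "..."}',
#    'abstract': u'We present ...'}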
110 """A connection to Citeseer."""
111
112 schema = 'org.pybliographer/bibtex/0.1'
113
114 BATCH_SIZE = 50
115 FETCHER_POOL = 2
116
117 MIRRORS = ['http://citeseer.ist.psu.edu/cis',
118 'http://citeseer.ittc.ku.edu/cs']
119
120 baseURL = MIRRORS[1]
121
    def __init__(self, db):
        self.db = db
        self._pending = None
        self._reader = RelaxedBibTeX('utf-8')
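    # _query() packs the Citeseer-specific options into a single 'qb'
    # parameter, so the final URL looks roughly like (illustrative):
    #
    #   http://citeseer.ittc.ku.edu/cs?q=scheme&qb=dbnum%3D1%2Cstart%3D0%2C...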
    def _query(self, query, start=0):
        assert self._pending is None, \
            'no more than one search at a time per connection'

        qb = {'dbnum': 1,
              'start': start,
              'am': self.BATCH_SIZE,
              'ao': 'Citations',
              'af': 'Any',
              'qtype': 'document:'}
        all = {'q': query,
               'qb': ','.join('%s=%s' % v for v in qb.iteritems())}

        # urlencode() chokes on unicode values, so encode them first.
        for k, v in all.items():
            if isinstance(v, unicode):
                all[k] = v.encode('utf-8')
        url = self.baseURL + '?' + urllib.urlencode(all)

        log.info('sending query %r' % url)
        self._pending = HTTPRetrieve(url)

        def done(data):
            # Whatever happens, the connection becomes available again.
            self._pending = None
            return data

        def parse(data):
            return ResultScraper(data)

        return self._pending.deferred.\
               addBoth(done).\
               addCallback(parse)
    # Note: the name and signature below are assumed; the original
    # header of this method was lost, and the surviving body mirrors
    # the scaffolding of search() below.
    def count(self, query):
        """Return a deferred that fires with the total number of hits
        for the query, without importing any record."""
        req = self._query(query)
        results = defer.Deferred()

        def failed(reason):
            results.errback(reason)

        def got_summary(data):
            results.callback(data.count())

        req.addCallback(got_summary).addErrback(failed)
        return results
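    # search() runs in two phases: got_summary()/got_links() below
    # first accumulate up to min(maxhits, total) result links, batch by
    # batch, then FETCHER_POOL concurrent got_page() fetchers pop the
    # links and scrape each detailed citation page.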
    def search(self, query, maxhits=100):
        rs = self.db.rs.new()
        rs.name = _('Imported from Citeseer')

        req = self._query(query)
        results = defer.Deferred()

        self._abort = False

        def failed(reason):
            results.errback(reason)
        def got_page(data, link):
            """Handle a detailed citation page."""
            if data:
                log.info('obtained page %r' % link)
                citation = data.citation()
                if not citation['bibtex']:
                    log.warn('page has no bibtex field?')
                else:
                    fd = StringIO.StringIO(citation['bibtex'].encode('utf-8'))
                    try:
                        obtained = self._reader.parse(fd, self.db)
                    except ParserError, msg:
                        log.error('unable to parse %r: %s' % (
                            citation['bibtex'], msg))
                        obtained = []
                    for key in obtained:
                        # Attach the scraped abstract to every record
                        # parsed out of the BibTeX snippet.
                        if 'abstract' in citation:
                            record = self.db[key]
                            record.add('abstract',
                                       citation['abstract'],
                                       Attribute.Text)
                            self.db[key] = record
                        rs.add(key)
            if self._links and not self._abort:
                # More detailed pages are pending: keep this fetcher
                # busy with the next link.
                link = self._links.pop()
                fetcher = HTTPRetrieve(link)
                log.info('fetching detailed page %r' % link)
                self._running.append(link)

                def done(data):
                    self._running.remove(link)
                    return data

                def parse_citation(data):
                    return CitationScraper(data)

                def inner_failure(data):
                    if not self._running:
                        results.errback(data)
                    self._abort = data

                fetcher.deferred.\
                    addBoth(done).\
                    addCallback(parse_citation).\
                    addCallback(got_page, link).\
                    addErrback(inner_failure)
            elif not self._running:
                # No link left and no fetcher running: the search is
                # over.  _abort is False (normal completion) or True
                # (user cancellation) on success, and holds the failure
                # otherwise.
                if not self._abort or self._abort is True:
                    results.callback(self._total)
                else:
                    results.errback(self._abort)
        def got_summary(data):
            """Handle a result page."""
            self._total = data.count()
            self._target = min(maxhits, self._total)
            self._current = 0
            log.info('%d results for the query' % self._total)
            self._links = set()

            def got_links(data):
                current = data.links()
                previous = len(self._links)
                self._links.update(current)
                obtained = len(self._links) - previous
                if obtained == 0:
                    log.warn('this batch did not provide new links, stopping')
                self._current += self.BATCH_SIZE
                log.info('%d links in this batch (%d/%d)' % (
                    len(current), len(self._links), self._total))
                missing = self._target - len(self._links)
                if missing > 0 and obtained > 0:
                    log.info('getting batch at %d, %d missing' % (
                        self._current, missing))
                    next = self._query(query, self._current)
                    next.addCallback(got_links).addErrback(failed)
                else:
                    # Enough links (or no new ones): start the fetcher
                    # pool; each initial got_page() call pops a link.
                    self._running = []
                    for i in xrange(self.FETCHER_POOL):
                        got_page(None, None)

            got_links(data)
        req.addCallback(got_summary).addErrback(failed)
        return results, rs
    def cancel(self):
        """Abort the query currently in progress."""
        self._abort = True
        if self._pending:
            self._pending.cancel()