-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
52 lines (38 loc) · 2.04 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import re
import requests
from bs4 import BeautifulSoup
r = requests.get("http://www.gutenberg.org/ebooks/search/?query=charles+dickens")
html = r.content
soup = BeautifulSoup(html, 'html.parser')
#print(soup.prettify())
items_per_page = soup.find("meta", {"name": "itemsPerPage"})
items = int(items_per_page['content'])
check_next = soup.select("[title^=Nex]")
booklist = []
i = 0
while soup.select("[title^=Nex]") != []:
review_titles_list = soup.find_all("li", {"class": "booklink"})
for block in review_titles_list:
titles = block.find_all("span", {"class": "cell content"})
for title in titles:
if title.find("span", {"class": "subtitle"}) != None and re.match('.*?Charles Dickens.*',(title.find("span", {"class": "subtitle"})).get_text()):
author = (title.find("span", {"class": "subtitle"})).get_text()
booklist.append(str(i) + ". " + title.find("span", {"class": "title"}).get_text() + " by " + author)
i += 1
get_start_index = soup.find("meta", {"name": "startIndex"})
start_index = int(get_start_index['content']) + items
r = requests.get("http://www.gutenberg.org/ebooks/search/?start_index="+str(start_index)+"&query=charles+dickens")
html = r.content
soup = BeautifulSoup(html, 'html.parser')
r = requests.get("http://www.gutenberg.org/ebooks/search/?start_index="+str(start_index)+"&query=charles+dickens")
html = r.content
soup = BeautifulSoup(html, 'html.parser')
review_titles_list = soup.find_all("li", {"class": "booklink"})
for block in review_titles_list:
titles = block.find_all("span", {"class": "cell content"})
for title in titles:
if title.find("span", {"class": "subtitle"}) != None and re.match('.*?Charles Dickens.*', (title.find("span", {"class": "subtitle"})).get_text()):
author = (title.find("span", {"class": "subtitle"})).get_text()
booklist.append(str(i) + ". " + title.find("span", {"class": "title"}).get_text() + " by " + author)
i += 1
print('[%s]'%'\n'.join(map(str,booklist)))