'''
:: Process ::
1. Gets the URL from textPreprocessing.py.
2. Fetches the HTML for that URL and cleans the page (removes unwanted blocks).
3. Converts the cleaned HTML to text using html2text.
4. Returns the text to textPreprocessing.py.
'''
import requests
from bs4 import BeautifulSoup
import html2text
import sys


class FetchWikipage:
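    """Fetch a Wikipedia page over HTTP, strip unwanted HTML blocks, and convert the remainder to plain text with html2text."""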
    def __init__(self):
        self.text_maker = html2text.HTML2Text()
        self.define_html2text_parms()  # init html2text params
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
        }

    def define_html2text_parms(self) -> None:
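        """Set the html2text conversion options used when rendering the cleaned HTML to text."""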
        self.text_maker.ignore_links = True
        self.text_maker.ignore_images = True
        self.text_maker.escape_all = True  # Escape all special characters; output is less readable but avoids corner-case formatting issues.
        self.text_maker.unicode_snob = True  # Use Unicode throughout instead of ASCII.
        self.text_maker.single_line_break = True
        self.text_maker.google_list_indent = 0
        self.text_maker.body_width = 0  # 0 for no wrap
        # self.text_maker.wrap_list_items = True  # Wrap list items during text wrapping.
        self.text_maker.no_automatic_links = True  # Do not use automatic links like https://www.google.com/
        # risky to remove them?
        self.text_maker.ignore_emphasis = True

    def generateHTML2text(self, url: str = None) -> str:
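        """Fetch `url`, remove unwanted Wikipedia markup from the HTML, and return the html2text conversion of the page body."""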
        try:
            if url is None or url == "":
                raise Exception('\033[91m' + "[URL ERR] URL is missing !!" + '\033[0m')
            res = requests.get(url, headers=self.headers)
            if res.status_code != 200:
                raise Exception('\033[91m' + "[URL ERR] Status = {} !!".format(res.status_code) + '\033[0m')
        except Exception as e:
            print('\033[91m' + "[ERR] Got an error with the URL '{}' !! (Exiting the current execution)".format(url) + '\033[0m')
            # print('\033[93m' + str(e) + '\033[0m')
            sys.exit()
print("[INFO] Starting with '{:s}' URL".format(url))
html_page = res.content
soup = BeautifulSoup(html_page, 'html5lib')
        # TODO: if the headings are important, do not remove h1, h2, h3, h4, h5, h6
        # "Further reading", "Publications", "See also", "Others", "Works cited" = class(div-col columns column-width)
        # Remove unwanted blocks // HTML preprocessing.
        html_tags = ['script', 'style', 'footer', 'img', 'noscript', 'svg', 'link', 'audio', 'video',
                     'code', 'figcaption', 'figure', 'iframe', 'map', 'picture', 'h1', 'h2', 'h3', 'h4',
                     'h5', 'h6', 'abbr', 'blockquote']
        div_class_list = ["reflist", "hatnote", "mw-indicators", "authority-control", "thumbinner",
                          "noprint", "navbox", "div-col columns column-width", "printfooter",
                          "sistersitebox", "metadata"]
        div_id_list = ["mw-navigation", "toc", "siteSub", "siteNotice", "catlinks", "mwe-popups-svg",
                       "mw-data-after-content"]
        for remove_block in soup.find_all(html_tags):
            remove_block.decompose()
        for remove_block in soup.find_all('div', id=div_id_list):
            remove_block.decompose()
        for remove_block in soup.find_all('div', {"class": div_class_list}):
            remove_block.decompose()
        for remove_block in soup.find_all('a', {"class": ["mw-jump-link"]}):
            remove_block.decompose()
# "biography" is the main table which contains the imp data and "wikitable" holds some notable data.
for remove_block in soup.find_all('table', {"class":["sistersitebox", "nowraplinks", "biography", "metadata", "wikitable"]}):
remove_block.decompose()
# remove sup having reference class
for remove_block in soup.find_all('sup', {"class":["reference"]}):
remove_block.decompose()
        # remove the parent list of any <li> holding an external link (the external-link lists at the bottom of the page)
        for remove_block in soup.find_all('li'):
            for remove_ele in remove_block.find_all('a', {"class": ["external text"]}):
                remove_block.find_parent().decompose()
                break
        # remove the parent list of any <li> containing a <cite> tag
        for remove_block in soup.find_all('li'):
            for ele in remove_block.contents:
                if ele.name == 'cite':
                    remove_block.find_parent().decompose()
                    break
        # remove "Footnotes" / "Citations" heading paragraphs
        for remove_block in soup.find_all('p'):
            if remove_block.contents and remove_block.contents[0].string in ["Footnotes", "Citations"]:
                remove_block.decompose()
        # print(soup.body)
        # with open("./textual_data/test.html", 'w', encoding='utf8') as f:
        #     f.write(str(soup.body))
        content = str(soup.body)
        text = self.text_maker.handle(content)  # html to textual form
        return text


if __name__ == "__main__":
    page_obj = FetchWikipage()
    print(page_obj.generateHTML2text("https://en.wikipedia.org/wiki/Albert_Einstein"))  # => html2text data