# tech_parser.py
#
# Builds a CSV that lists, for each technology-scan JSON file matching a
# known base domain, the scanned URL, the detected CMS and its version,
# and the domain's Tranco rank.
import json
import csv
import os
import pandas as pd
def read_base_domains(filename):
    """Read the base domains listed one per line in a text file.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of the non-empty, stripped lines, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(filename, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # An empty entry is never a valid domain, and it would act as a
        # match-all suffix for callers that compare file names with endswith().
        return [domain for domain in stripped_lines if domain]
def remove_url_parameters(url):
    """Return *url* cut off just before its first '?'.

    A URL that contains no '?' is returned unchanged.
    """
    before_query, _, _ = url.partition('?')
    return before_query
def get_cms_data(filename):
    """Extract the scanned URL and the detected CMS from one technology JSON file.

    The code reads a JSON object with a ``urls`` mapping (URL -> object with
    a ``status`` entry) and a ``technologies`` list whose items provide
    ``name``, ``version`` and ``categories`` (each category having a
    ``name``). Keys that are absent are treated as empty instead of raising,
    so one incomplete scan file cannot abort a whole batch.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A three-item list ``[url, cms_name, cms_version]``:

        * ``["File Error", "File Error", "File Error"]`` when the file cannot
          be read, is not valid JSON, or its top-level value is not an object;
        * ``["0", "0", "0"]`` when the parsed JSON is empty;
        * ``[url, "0", "0"]`` when a URL with status 0 is met before any URL
          with status 200;
        * otherwise the first URL with status 200 (``''`` if there is none,
          query string removed), the CMS name (``'Not Detected'`` if none)
          and its version (``'0'`` if unknown).
    """
    file_error = ["File Error", "File Error", "File Error"]
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(filename, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading JSON from {filename}: {e}")
        return file_error

    if not data:
        return ["0", "0", "0"]
    if not isinstance(data, dict):
        print(f"Error reading JSON from {filename}: top-level value is not an object")
        return file_error

    url = ''
    for url_data, status in (data.get("urls") or {}).items():
        url_data = remove_url_parameters(url_data)
        status_code = status.get('status') if isinstance(status, dict) else None
        if status_code == 200:
            url = url_data
            break
        if status_code == 0:
            # Status 0 is reported as a row without any CMS information.
            return [url_data, "0", "0"]

    cms_name = ''
    cms_version = ''
    if url:
        for technology in data.get('technologies') or []:
            categories = technology.get('categories') or []
            if any(category.get('name') == 'CMS' for category in categories):
                # No early exit: when several technologies are tagged CMS,
                # the last one listed is the one reported.
                cms_name = technology.get('name', '')
                cms_version = technology.get('version')

    if not cms_name:
        return [url, 'Not Detected', '0']
    return [url, cms_name, cms_version or '0']
def load_tranco_data(filename):
    """Load a header-less two-column CSV into a Series of ranks keyed by domain.

    The first column is read as 'TrancoRank' and the second as 'Domain'; the
    returned Series holds the rank values and is indexed by the domain column.
    """
    column_names = ['TrancoRank', 'Domain']
    table = pd.read_csv(filename, names=column_names, header=None)
    ranks = table['TrancoRank'].values
    domains = table['Domain']
    return pd.Series(ranks, index=domains)
def find_tranco_rank(full_domain, tranco_series):
    """Look up *full_domain* in *tranco_series*.

    Returns the value stored under that key, or the string '0' when the
    domain is not present.
    """
    not_ranked = '0'
    return tranco_series.get(full_domain, not_ranked)
def create_csv(base_domains_file, folder, tranco_file='tranco_Z25PG.csv',
               output_file='domains.csv'):
    """Write a CSV summarising the CMS detected for every scanned domain.

    Each regular file in *folder* whose name ends in '.json' is treated as a
    scan result. Its domain is the file name without the '.json' suffix and
    without a leading 'tech_' prefix (when present). The file is attributed
    to every base domain that the domain equals or is a subdomain of, and one
    row is written per (base domain, file) pair.

    Args:
        base_domains_file: Text file listing one base domain per line.
        folder: Directory containing the technology JSON files.
        tranco_file: Header-less ``rank,domain`` CSV used for rank lookups.
        output_file: Path of the CSV to create (overwritten if it exists).

    Returns:
        None. The result is written to *output_file* with the columns
        BaseDomain, FullDomain, URL, CMS, Version, TrancoRank.
    """
    base_domains = read_base_domains(base_domains_file)
    tranco_series = load_tranco_data(tranco_file)

    prefix = 'tech_'
    suffix = '.json'
    domains_data = {}
    # scandir holds an OS directory handle; the with-block releases it promptly.
    with os.scandir(folder) as entries:
        for entry in entries:
            name = entry.name
            if not entry.is_file() or not name.endswith(suffix):
                continue
            # Slice instead of split('.json') so a domain that itself
            # contains '.json' is not truncated, and strip the prefix only
            # when it is really there.
            full_domain = name[:-len(suffix)]
            if full_domain.startswith(prefix):
                full_domain = full_domain[len(prefix):]
            for base_domain in base_domains:
                if not base_domain:
                    continue
                # Require a label boundary: 'mit.edu' must match
                # 'www.mit.edu' but not 'summit.edu'.
                if full_domain == base_domain or full_domain.endswith('.' + base_domain):
                    domains_data.setdefault(base_domain, []).append((full_domain, name))

    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['BaseDomain', 'FullDomain', 'URL', 'CMS', 'Version', 'TrancoRank'])
        for base_domain, files in domains_data.items():
            for full_domain, filename in files:
                full_path = os.path.join(folder, filename)
                tranco_rank = find_tranco_rank(full_domain, tranco_series)
                if os.path.exists(full_path):
                    cms_data = get_cms_data(full_path)
                else:
                    # The file vanished after the directory scan; keep the
                    # row the same width as the header.
                    cms_data = ['File Not Found', 'File Not Found', 'File Not Found']
                writer.writerow([base_domain, full_domain, *cms_data, tranco_rank])
# File containing the base domains
base_domains_file = "uni_list.txt"
# Folder containing the technology files
folder = "technology"
# Create the CSV only when executed as a script, so that importing this
# module does not read the input files or write any output.
if __name__ == "__main__":
    create_csv(base_domains_file, folder)