Skip to content

Commit 2b6d715

Browse files
committed
Updated scraper for new Kaggle API >=1.7
1 parent f0069d4 commit 2b6d715

File tree

3 files changed

+29
-6
lines changed

3 files changed

+29
-6
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@ Gemfile.lock
44
*.gem
55
.jekyll-cache
66
.jekyll-cache
7+
*.sqlite
8+
kaggle_stats.json

kaggle_json.py

+26-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from github import Github
2-
import github,requests_cache,re,os,kaggle,json,numpy as np
2+
import traceback,github,requests_cache,re,os,kaggle,json,numpy as np
33

44
GITHUB_TOKEN = os.getenv("GIST_TOKEN")
55
GIST_ID = "c9112c25c5acd400b90741efa81aa411"
@@ -81,20 +81,41 @@ def unformat_bytes(string):
8181

8282
for dsn in dataset_names:
8383
print(f'Processing {dsn}...')
84+
# Old Kaggle API (<1.7): dataset fields are read from vars(d) under camelCase keys
8485
try:
8586
user = dsn.split("/")[0]
8687
dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
8788
downloads.append(int(dataset['downloadCount']))
8889
views.append(int(dataset['viewCount']))
8990
sizes.append(int(dataset['totalBytes']))
9091
print(f'{dsn} done.')
91-
92-
except Exception as e:
93-
print(f'{e} when reading {dsn}')
92+
93+
# New Kaggle API (>=1.7): on KeyError from the old-style lookup, read snake_case attributes instead
94+
except KeyError:
95+
try:
96+
user = dsn.split("/")[0]
97+
dataset = next((d for d in usernames[user] if d.ref == dsn))
98+
downloads.append(int(dataset.download_count))
99+
views.append(int(dataset.view_count))
100+
sizes.append(int(dataset.total_bytes))
101+
print(f'{dsn} done.')
102+
103+
except Exception:
104+
traceback.print_exc()
105+
print(f'Error when reading {dsn}')
106+
print(f'Continuing with 0 values...')
107+
downloads.append(0)
108+
views.append(0)
109+
sizes.append(0)
110+
111+
except Exception:
112+
traceback.print_exc()
113+
print(f'Error when reading {dsn}')
94114
print(f'Continuing with 0 values...')
95115
downloads.append(0)
96116
views.append(0)
97117
sizes.append(0)
118+
98119

99120
views = np.array(views)
100121
downloads = np.array(downloads)
@@ -121,7 +142,7 @@ def unformat_bytes(string):
121142
'size': ds_size,
122143
'views': ds_views,
123144
'downloads': ds_downs,
124-
}
145+
}
125146
json_dump[filename] = kaggle_stats
126147
total_bytes += int(np.sum(downloads*size_in_bytes))
127148
total_size += int(np.sum(size_in_bytes))

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
kaggle>=1.6,<1.7
1+
kaggle>=1.6
22
numpy>=1.2
33
requests-cache>=1.2.1
44
pygithub>=2.4.0

0 commit comments

Comments
 (0)