-
-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathdataset.py
29 lines (20 loc) · 1008 Bytes
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Sample data extraction script that generates a classification dataset using sklearn.datasets
from sklearn.datasets import make_classification
import pandas as pd
import os
def extract_data():
    """Generate synthetic classification data and append it to CSV files.

    Ensures the ``data`` directory exists, then appends generated batches to
    ``data/train.csv`` and ``data/test.csv``. When ``data/train.csv`` does not
    exist yet, 10 batches are written; otherwise a single batch is appended.

    Each batch has 10,000 rows with 10 feature columns (``feature_0`` ..
    ``feature_9``) and a binary ``target`` column. The first 8,000 rows go to
    the train file and the remaining 2,000 rows go to the test file.

    NOTE: ``random_state`` is fixed at 42, so every generated batch contains
    the same rows.

    Returns:
        None. The result is the files written to disk and a status message
        printed to stdout.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("data", exist_ok=True)

    train_path = "data/train.csv"
    test_path = "data/test.csv"

    # An existing train file means the initial load already happened, so only
    # one additional batch is appended.
    append_mode = os.path.isfile(train_path)
    num_datasets = 1 if append_mode else 10

    for _ in range(num_datasets):
        X, y = make_classification(
            n_samples=10000,
            n_features=10,
            n_informative=8,
            n_redundant=2,
            n_classes=2,
            random_state=42,
        )
        df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
        df["target"] = y

        train_data = df.iloc[:8000]
        test_data = df.iloc[8000:]

        for path, frame in ((train_path, train_data), (test_path, test_data)):
            # Decide the header per file at write time: only the write that
            # creates the file emits it. A single flag computed before the
            # loop would repeat the header on every batch of the initial
            # load, and would skip it for a test file that is missing while
            # the train file exists.
            write_header = not os.path.isfile(path)
            frame.to_csv(path, mode="a", header=write_header, index=False)

    print("Extracted data from source successfully")
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    extract_data()