From d411c7745867961774bd60446f59408f7e77d2ef Mon Sep 17 00:00:00 2001 From: Arun M <121847615+arumachu@users.noreply.github.com> Date: Wed, 18 Sep 2024 17:50:32 +0530 Subject: [PATCH] Add function to download dataset to a specific location. --- 00_core.ipynb | 151 ++++++++++++++++++++++++++++++++++++++---- fastkaggle/_modidx.py | 60 +++++------------ fastkaggle/core.py | 62 +++++++++++++---- 3 files changed, 204 insertions(+), 69 deletions(-) diff --git a/00_core.ipynb b/00_core.ipynb index ce8e998..8713a77 100644 --- a/00_core.ipynb +++ b/00_core.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -72,16 +72,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(#20) [contradictory-my-dear-watson,gan-getting-started,store-sales-time-series-forecasting,tpu-getting-started,digit-recognizer,titanic,house-prices-advanced-regression-techniques,connectx,nlp-getting-started,spaceship-titanic...]" + "(#16) [https://www.kaggle.com/competitions/arc-prize-2024,https://www.kaggle.com/competitions/eedi-mining-misconceptions-in-mathematics,https://www.kaggle.com/competitions/rsna-2024-lumbar-spine-degenerative-classification,https://www.kaggle.com/competitions/ariel-data-challenge-2024,https://www.kaggle.com/competitions/um-game-playing-strength-of-mcts-variants,https://www.kaggle.com/competitions/playground-series-s4e9,https://www.kaggle.com/competitions/titanic,https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques,https://www.kaggle.com/competitions/spaceship-titanic,https://www.kaggle.com/competitions/digit-recognizer...]" ] }, - "execution_count": null, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -116,16 +116,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading titanic.zip to C:\\Users\\iarun\\OneDrive\\Desktop\\Skills\\01_Computer_Sci\\03_AI\\10_Kaggle\\fastkaggle\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 34.1k/34.1k [00:00<00:00, 237kB/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, { "data": { "text/plain": [ "Path('titanic')" ] }, - "execution_count": null, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -134,6 +162,93 @@ "setup_comp('titanic')" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "#|export\n", + "def setup_comp_directory(competition, path_to_download, install=''):\n", + " \"\"\"\n", + " Setup the environment for a Kaggle competition by downloading competition data and optionally installing packages.\n", + " \n", + " Inputs:\n", + " - competition (str): The name of the Kaggle competition (e.g., 'titanic'). This is used to fetch the relevant dataset.\n", + " - path_to_download (str): The directory path where competition data should be downloaded. \n", + " The function will create the directory if it does not already exist.\n", + " - install (str, optional): Optional package to install via pip (e.g., 'fastai') if running in a Kaggle environment. Default is '' (no installation).\n", + "\n", + " Outputs:\n", + " - path (Path object): The directory path where the competition data is located. \n", + " This will be the directory containing the unzipped data files.\n", + " \"\"\"\n", + " # Specify the desired directory to download files\n", + " custom_dir = Path(path_to_download) # Replace with your desired path\n", + " custom_dir.mkdir(parents=True, exist_ok=True) # Create the directory if it doesn't exist\n", + " \"Get a path to data for `competition`, downloading it if needed\"\n", + " if iskaggle:\n", + " if install:\n", + " os.system(f'pip install -Uqq {install}')\n", + " return Path('../input')/competition\n", + " else:\n", + " path = custom_dir / Path(competition)\n", + " api = import_kaggle()\n", + " if not path.exists():\n", + " import zipfile\n", + " api.competition_download_cli(competition, path=str(custom_dir)) # Download to custom directory\n", + " zipfile.ZipFile(f'{custom_dir}/{competition}.zip').extractall(path) # Extract files in custom directory\n", + " return path" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading titanic.zip to New_Folder\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 34.1k/34.1k [00:00<00:00, 223kB/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "Path('New_Folder/titanic')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "setup_comp_directory('titanic','New_Folder')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -520,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -543,6 +658,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.19" } }, "nbformat": 4, diff --git a/fastkaggle/_modidx.py b/fastkaggle/_modidx.py index 077d965..1bd645a 100644 --- a/fastkaggle/_modidx.py +++ b/fastkaggle/_modidx.py @@ -1,48 +1,22 @@ # Autogenerated by nbdev -d = { 'settings': { 'allowed_cell_metadata_keys': '', - 'allowed_metadata_keys': '', - 'audience': 'Developers', - 'author': 'Jeremy Howard', - 'author_email': 'info@fast.ai', - 'black_formatting': 'False', - 'branch': 'master', - 'clean_ids': 'True', - 'copyright': 'Jeremy Howard, 2022 onwards', - 'custom_sidebar': 'False', - 'description': 'Kaggling for fast kagglers!', +d = { 'settings': { 'branch': 'master', 'doc_baseurl': '/fastkaggle/', 'doc_host': 'https://fastai.github.io', - 'doc_path': 'docs', 'git_url': 'https://github.com/fastai/fastkaggle/tree/master/', - 'host': 'github', - 'jupyter_hooks': 'True', - 'keywords': 'machine-learning kaggle fastai nbdev', - 'language': 'English', - 'lib_name': 'fastkaggle', - 'lib_path': 'fastkaggle', - 'license': 'apache2', - 'min_python': '3.7', - 'nbs_path': '.', - 'readme_nb': 'index.ipynb', - 'recursive': 'False', - 'requirements': 'fastcore>=1.4.5 kaggle', - 'status': '2', - 'title': 'fastkaggle', - 'tst_flags': 'notest', - 'user': 'fastai', - 'version': '0.0.8'}, - 'syms': { 'fastkaggle.core': { 'fastkaggle.core.check_ds_exists': 'https://fastai.github.io/fastkaggle/core.html#check_ds_exists', - 'fastkaggle.core.create_libs_datasets': 'https://fastai.github.io/fastkaggle/core.html#create_libs_datasets', - 'fastkaggle.core.create_requirements_dataset': 'https://fastai.github.io/fastkaggle/core.html#create_requirements_dataset', - 'fastkaggle.core.get_dataset': 'https://fastai.github.io/fastkaggle/core.html#get_dataset', - 'fastkaggle.core.get_local_ds_ver': 'https://fastai.github.io/fastkaggle/core.html#get_local_ds_ver', - 'fastkaggle.core.get_pip_libraries': 'https://fastai.github.io/fastkaggle/core.html#get_pip_libraries', - 'fastkaggle.core.get_pip_library': 'https://fastai.github.io/fastkaggle/core.html#get_pip_library', - 'fastkaggle.core.import_kaggle': 'https://fastai.github.io/fastkaggle/core.html#import_kaggle', - 'fastkaggle.core.iskaggle': 'https://fastai.github.io/fastkaggle/core.html#iskaggle', - 'fastkaggle.core.mk_dataset': 'https://fastai.github.io/fastkaggle/core.html#mk_dataset', - 'fastkaggle.core.nb_meta': 'https://fastai.github.io/fastkaggle/core.html#nb_meta', - 'fastkaggle.core.push_dataset': 'https://fastai.github.io/fastkaggle/core.html#push_dataset', - 'fastkaggle.core.push_notebook': 'https://fastai.github.io/fastkaggle/core.html#push_notebook', - 'fastkaggle.core.setup_comp': 'https://fastai.github.io/fastkaggle/core.html#setup_comp'}}} \ No newline at end of file + 'lib_path': 'fastkaggle'}, + 'syms': { 'fastkaggle.core': { 'fastkaggle.core.check_ds_exists': ('core.html#check_ds_exists', 'fastkaggle/core.py'), + 'fastkaggle.core.create_libs_datasets': ('core.html#create_libs_datasets', 'fastkaggle/core.py'), + 'fastkaggle.core.create_requirements_dataset': ( 'core.html#create_requirements_dataset', + 'fastkaggle/core.py'), + 'fastkaggle.core.get_dataset': ('core.html#get_dataset', 'fastkaggle/core.py'), + 'fastkaggle.core.get_local_ds_ver': ('core.html#get_local_ds_ver', 'fastkaggle/core.py'), + 'fastkaggle.core.get_pip_libraries': ('core.html#get_pip_libraries', 'fastkaggle/core.py'), + 'fastkaggle.core.get_pip_library': ('core.html#get_pip_library', 'fastkaggle/core.py'), + 'fastkaggle.core.import_kaggle': ('core.html#import_kaggle', 'fastkaggle/core.py'), + 'fastkaggle.core.mk_dataset': ('core.html#mk_dataset', 'fastkaggle/core.py'), + 'fastkaggle.core.nb_meta': ('core.html#nb_meta', 'fastkaggle/core.py'), + 'fastkaggle.core.push_dataset': ('core.html#push_dataset', 'fastkaggle/core.py'), + 'fastkaggle.core.push_notebook': ('core.html#push_notebook', 'fastkaggle/core.py'), + 'fastkaggle.core.setup_comp': ('core.html#setup_comp', 'fastkaggle/core.py'), + 'fastkaggle.core.setup_comp_directory': ('core.html#setup_comp_directory', 'fastkaggle/core.py')}}} diff --git a/fastkaggle/core.py b/fastkaggle/core.py index ad2d749..4528d5f 100644 --- a/fastkaggle/core.py +++ b/fastkaggle/core.py @@ -1,9 +1,11 @@ +"""API details for fastkaggle.""" + # AUTOGENERATED! DO NOT EDIT! File to edit: ../00_core.ipynb. # %% auto 0 -__all__ = ['iskaggle', 'import_kaggle', 'setup_comp', 'nb_meta', 'push_notebook', 'check_ds_exists', 'mk_dataset', 'get_dataset', - 'get_pip_library', 'get_pip_libraries', 'push_dataset', 'get_local_ds_ver', 'create_libs_datasets', - 'create_requirements_dataset'] +__all__ = ['iskaggle', 'import_kaggle', 'setup_comp', 'setup_comp_directory', 'nb_meta', 'push_notebook', 'check_ds_exists', + 'mk_dataset', 'get_dataset', 'get_pip_library', 'get_pip_libraries', 'push_dataset', 'get_local_ds_ver', + 'create_libs_datasets', 'create_requirements_dataset'] # %% ../00_core.ipynb 3 import os,json,subprocess, shutil @@ -42,7 +44,39 @@ def setup_comp(competition, install=''): zipfile.ZipFile(f'{competition}.zip').extractall(str(competition)) return path -# %% ../00_core.ipynb 10 +# %% ../00_core.ipynb 9 +def setup_comp_directory(competition, path_to_download, install=''): + """ + Setup the environment for a Kaggle competition by downloading competition data and optionally installing packages. + + Inputs: + - competition (str): The name of the Kaggle competition (e.g., 'titanic'). This is used to fetch the relevant dataset. + - path_to_download (str): The directory path where competition data should be downloaded. + The function will create the directory if it does not already exist. + - install (str, optional): Optional package to install via pip (e.g., 'fastai') if running in a Kaggle environment. Default is '' (no installation). + + Outputs: + - path (Path object): The directory path where the competition data is located. + This will be the directory containing the unzipped data files. + """ + # Specify the desired directory to download files + custom_dir = Path(path_to_download) # Replace with your desired path + custom_dir.mkdir(parents=True, exist_ok=True) # Create the directory if it doesn't exist + "Get a path to data for `competition`, downloading it if needed" + if iskaggle: + if install: + os.system(f'pip install -Uqq {install}') + return Path('../input')/competition + else: + path = custom_dir / Path(competition) + api = import_kaggle() + if not path.exists(): + import zipfile + api.competition_download_cli(competition, path=str(custom_dir)) # Download to custom directory + zipfile.ZipFile(f'{custom_dir}/{competition}.zip').extractall(path) # Extract files in custom directory + return path + +# %% ../00_core.ipynb 12 def nb_meta(user, id, title, file, competition=None, private=True, gpu=False, internet=True, linked_datasets=None): "Get the `dict` required for a kernel-metadata.json file" d = { @@ -61,7 +95,7 @@ def nb_meta(user, id, title, file, competition=None, private=True, gpu=False, in if competition: d["competition_sources"] = [f"competitions/{competition}"] return d -# %% ../00_core.ipynb 12 +# %% ../00_core.ipynb 14 def push_notebook(user, id, title, file, path='.', competition=None, private=True, gpu=False, internet=True, linked_datasets=None): "Push notebook `file` to Kaggle Notebooks" meta = nb_meta(user, id, title, file=file, competition=competition, private=private, gpu=gpu, internet=internet, linked_datasets=linked_datasets) @@ -72,7 +106,7 @@ def push_notebook(user, id, title, file, path='.', competition=None, private=Tru api = import_kaggle() api.kernels_push_cli(str(path)) -# %% ../00_core.ipynb 16 +# %% ../00_core.ipynb 18 def check_ds_exists(dataset_slug # Dataset slug (ie "zillow/zecon") ): '''Checks if a dataset exists in kaggle and returns boolean''' @@ -82,7 +116,7 @@ def check_ds_exists(dataset_slug # Dataset slug (ie "zillow/zecon") elif len(ds_search)==0: return False else: raise exception("Multiple datasets found - Check Manually") -# %% ../00_core.ipynb 17 +# %% ../00_core.ipynb 19 def mk_dataset(dataset_path, # Local path to create dataset in title, # Name of the dataset force=False, # Should it overwrite or error if exists? @@ -100,7 +134,7 @@ def mk_dataset(dataset_path, # Local path to create dataset in if upload: (dataset_path/'empty.txt').touch() api.dataset_create_new(str(dataset_path),public=True,dir_mode='zip',quiet=True) -# %% ../00_core.ipynb 19 +# %% ../00_core.ipynb 21 def get_dataset(dataset_path, # Local path to download dataset to dataset_slug, # Dataset slug (ie "zillow/zecon") unzip=True, # Should it unzip after downloading? @@ -119,7 +153,7 @@ def get_dataset(dataset_path, # Local path to download dataset to zipped_file.unlink() -# %% ../00_core.ipynb 20 +# %% ../00_core.ipynb 22 def get_pip_library(dataset_path, # Local path to download pip library to pip_library, # name of library for pip to install pip_cmd="pip" # pip base to use (ie "pip3" or "pip") @@ -129,7 +163,7 @@ def get_pip_library(dataset_path, # Local path to download pip library to process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) output, error = process.communicate() -# %% ../00_core.ipynb 21 +# %% ../00_core.ipynb 23 def get_pip_libraries(dataset_path, # Local path to download pip library to requirements_path, # path to requirements file pip_cmd="pip" # pip base to use (ie "pip3" or "pip") @@ -139,7 +173,7 @@ def get_pip_libraries(dataset_path, # Local path to download pip library to process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) output, error = process.communicate() -# %% ../00_core.ipynb 23 +# %% ../00_core.ipynb 25 def push_dataset(dataset_path, # Local path where dataset is stored version_comment # Comment associated with this dataset update ): @@ -147,7 +181,7 @@ def push_dataset(dataset_path, # Local path where dataset is stored api = import_kaggle() api.dataset_create_version(str(dataset_path),version_comment,dir_mode='zip',quiet=True) -# %% ../00_core.ipynb 24 +# %% ../00_core.ipynb 26 def get_local_ds_ver(lib_path, # Local path dataset is stored in lib # Name of library (ie "fastcore") ): @@ -163,7 +197,7 @@ def get_local_ds_ver(lib_path, # Local path dataset is stored in return re.search(f"(?<={wheel_lib_name}-)[\d+.]+\d",lib_whl[0].name.lower())[0] return None -# %% ../00_core.ipynb 26 +# %% ../00_core.ipynb 28 def create_libs_datasets(libs, # library or list of libraries to create datasets for (ie 'fastcore or ['fastcore','fastkaggle'] lib_path, # Local path to dl/create dataset username, # You username @@ -202,7 +236,7 @@ def create_libs_datasets(libs, # library or list of libraries to create datasets if clear_after: shutil.rmtree(local_path) print(f"{lib} | Complete") -# %% ../00_core.ipynb 27 +# %% ../00_core.ipynb 29 def create_requirements_dataset(req_fpath, # Path to requirements.txt file lib_path,#Local path to dl/create dataset title, # Title you want the kaggle dataset named