"""Download captcha images from an aggregator API and split them into datasets.

Workflow (see ``download_dataset``):

1. create the download / testing / training directories,
2. ask the API for the list of known captchas,
3. download every captcha that is not stored in any of the three directories,
4. move the freshly downloaded files into the testing and training sets.

Configuration is read from the environment; a ``.env`` file is loaded through
python-dotenv at import time.

    CAPTCHA_AGGREGATOR_API  base URL of the API
                            (sample.env: https://captcha.foxarmy.org/api)
    PERCENT_OF_TESTING      integer percentage of new files sent to the
                            testing set (sample.env: 10)
    DOWNLOAD_PATH           directory new captchas are first written to
    TESTING_PATH            directory holding the testing set
    TRAINING_PATH           directory holding the training set

NOTE: the sample.env shipped with this change only defines the first two
variables; the three ``*_PATH`` variables must be added before running.

Third-party packages pinned in the accompanying requirements.txt:
certifi==2025.4.26, charset-normalizer==3.4.2, dotenv==0.9.9, idna==3.10,
python-dotenv==1.1.0, requests==2.32.3, urllib3==2.4.0.
"""
from base64 import b64decode
from os import environ, listdir, makedirs, path, walk
import re
from shutil import move

from dotenv import load_dotenv
import requests

load_dotenv()

DOWNLOAD_PATH = environ.get("DOWNLOAD_PATH")
TESTING_PATH = environ.get("TESTING_PATH")
TRAINING_PATH = environ.get("TRAINING_PATH")

# Seconds to wait for the API. Without an explicit timeout ``requests`` may
# block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# Shape of the file names written by ``fetch_captcha``:
# "<hash>_<six word characters>.jpeg".
_SAVED_CAPTCHA_NAME = re.compile(r"(?P<hash>.+)_\w{6}\.jpeg")


def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = environ.get(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} is not set")
    return value


def _api_get(endpoint):
    """GET *endpoint* (relative to the API base URL) and return the JSON body.

    Raises:
        RuntimeError: if ``CAPTCHA_AGGREGATOR_API`` is not configured.
        requests.RequestException: on timeout, connection failure or a
            non-2xx response.
    """
    base_url = _require_env("CAPTCHA_AGGREGATOR_API").rstrip("/")
    response = requests.get(f"{base_url}/{endpoint}", timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()


def _saved_hashes(*directories):
    """Return the set of captcha hashes stored anywhere under *directories*.

    Only files named like ``<hash>_<6 chars>.jpeg`` are considered.
    """
    hashes = set()
    for directory in directories:
        for _, _, file_names in walk(directory):
            for file_name in file_names:
                found = _SAVED_CAPTCHA_NAME.fullmatch(file_name)
                if found:
                    hashes.add(found["hash"])
    return hashes


def prepare_dirs():
    """Create the download, testing and training directories if missing.

    Raises:
        RuntimeError: if one of ``DOWNLOAD_PATH``, ``TESTING_PATH`` or
            ``TRAINING_PATH`` was not configured.
    """
    configured = (
        ("DOWNLOAD_PATH", DOWNLOAD_PATH),
        ("TESTING_PATH", TESTING_PATH),
        ("TRAINING_PATH", TRAINING_PATH),
    )
    for name, directory in configured:
        if not directory:
            # makedirs(None) would raise an unhelpful TypeError.
            raise RuntimeError(f"Environment variable {name} is not set")
        makedirs(directory, exist_ok=True)


def fetch_captcha(id):
    """Download the captcha with the given *id* into ``DOWNLOAD_PATH``.

    The image is stored as ``<hash>_<solution>.jpeg`` using the ``hash``,
    ``solution`` and base64-encoded ``image`` fields of the API response.

    Raises:
        ValueError: if the API-supplied hash/solution would produce a file
            name containing a path separator.
        requests.RequestException: if the request fails.
    """
    print(f"Fetching captcha with id {id}")
    captcha = _api_get(f"captcha/{id}")["captcha"]

    file_name = f"{captcha['hash']}_{captcha['solution']}.jpeg"
    # The name is built from remote data; never let it escape DOWNLOAD_PATH.
    if path.basename(file_name) != file_name:
        raise ValueError(f"Refusing to save captcha as {file_name!r}")

    with open(path.join(DOWNLOAD_PATH, file_name), "wb") as captcha_file:
        captcha_file.write(b64decode(captcha["image"]))


def search_saved_captcha(hash, path):
    """Return True if a captcha with *hash* is stored anywhere under *path*.

    A stored captcha is a file named ``<hash>_<6 word characters>.jpeg``;
    sub-directories of *path* are searched as well.
    """
    print(f"searching captcha with hash {hash} in {path}")
    # Escape the hash so regex metacharacters in it are taken literally, and
    # match the whole name so "x_abcdef.jpeg.part" does not count as saved.
    pattern = re.compile(re.escape(hash) + r"_\w{6}\.jpeg")
    return any(
        pattern.fullmatch(file_name)
        for _, _, file_names in walk(path)
        for file_name in file_names
    )


def search_and_download_new(captchas):
    """Download every captcha in *captchas* that is not stored locally yet.

    Args:
        captchas: iterable of mappings with at least ``"id"`` and ``"hash"``.
    """
    print("Searching and downloading new captchas")
    # Scan the three directories once instead of once per captcha.
    known_hashes = _saved_hashes(TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
    for captcha in captchas:
        captcha_hash = captcha["hash"]
        if captcha_hash in known_hashes:
            continue
        fetch_captcha(captcha["id"])
        # Remember it so a duplicate entry in the list is not fetched twice.
        known_hashes.add(captcha_hash)


def sort_datasets():
    """Move downloaded files into the testing and training directories.

    ``PERCENT_OF_TESTING`` percent (rounded) of the regular files directly
    inside ``DOWNLOAD_PATH`` go to ``TESTING_PATH``; the rest go to
    ``TRAINING_PATH``. Files are processed in sorted name order so the split
    is reproducible. Sub-directories of ``DOWNLOAD_PATH`` are left untouched.

    Raises:
        RuntimeError: if ``PERCENT_OF_TESTING`` is not configured.
        ValueError: if ``PERCENT_OF_TESTING`` is not an integer.
    """
    print("Sorting datasets")
    percent_of_testing = int(_require_env("PERCENT_OF_TESTING"))

    # Count and move the very same list of files; counting with listdir but
    # moving with a recursive walk made the two disagree for nested files.
    new_files = sorted(
        name
        for name in listdir(DOWNLOAD_PATH)
        if path.isfile(path.join(DOWNLOAD_PATH, name))
    )
    amount_of_new_data = len(new_files)
    print(amount_of_new_data)
    amount_to_send_to_test = round(amount_of_new_data * (percent_of_testing / 100))
    print(amount_to_send_to_test)

    for index, name in enumerate(new_files):
        if index < amount_to_send_to_test:
            destination = TESTING_PATH
        else:
            destination = TRAINING_PATH
        move(path.join(DOWNLOAD_PATH, name), destination)


def download_dataset():
    """Fetch all captchas missing locally and distribute them into datasets."""
    prepare_dirs()

    captchas = _api_get("captcha/all")["captchas"]

    search_and_download_new(captchas)
    sort_datasets()


def train_nn():
    """Placeholder for neural-network training; currently does nothing."""
    pass


if __name__ == "__main__":
    download_dataset()
    train_nn()