"""Download captcha images from an aggregator API and split them into datasets.

Configuration is read from the environment (after ``load_dotenv()``):

* ``DOWNLOAD_PATH``  - directory where freshly downloaded captchas are stored.
* ``TESTING_PATH``   - directory holding the testing dataset.
* ``TRAINING_PATH``  - directory holding the training dataset.
* ``CAPTCHA_AGGREGATOR_API`` - base URL of the captcha aggregator API.
* ``PERCENT_OF_TESTING`` - integer percentage of new captchas sent to testing.
"""
from base64 import b64decode
from os import environ, listdir, makedirs, path, walk
import re
from shutil import move

from dotenv import load_dotenv
import requests

load_dotenv()

DOWNLOAD_PATH = environ.get("DOWNLOAD_PATH")
TESTING_PATH = environ.get("TESTING_PATH")
TRAINING_PATH = environ.get("TRAINING_PATH")

# Seconds to wait for the API before giving up; without a timeout
# ``requests`` may block forever on an unresponsive server.
_REQUEST_TIMEOUT = 30


def prepare_dirs():
    """Create the download, testing and training directories if missing."""
    for directory in (DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH):
        makedirs(directory, exist_ok=True)


def fetch_captcha(id):
    """Download one captcha and store it in ``DOWNLOAD_PATH``.

    The API response is expected to contain a ``"captcha"`` object with the
    keys ``"hash"``, ``"solution"`` and ``"image"`` (base64 encoded). The file
    is written as ``<hash>_<solution>.jpeg``.

    Args:
        id: Identifier of the captcha as used by the aggregator API.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status.
    """
    print(f"Fetching captcha with id {id}")
    response = requests.get(
        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}",
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    captcha = response.json()["captcha"]

    # Decode before opening the file: if the payload is malformed we must not
    # leave an empty, validly named file behind, because it would later be
    # treated as an already downloaded captcha.
    image_bytes = b64decode(captcha["image"])

    # NOTE(review): the file name is built from values returned by the API;
    # confirm the API is trusted or sanitise 'hash'/'solution'.
    target = f"{DOWNLOAD_PATH}/{captcha['hash']}_{captcha['solution']}.jpeg"
    with open(target, "wb") as captcha_file:
        captcha_file.write(image_bytes)


def search_saved_captcha(hash, path):
    """Return True if a captcha with the given hash is stored under *path*.

    The directory tree rooted at *path* is searched for a file named
    ``<hash>_<6 word characters>.jpeg``.

    Args:
        hash: Hash of the captcha to look for (treated as literal text).
        path: Root directory to search recursively. Note that this parameter
            shadows the imported ``os.path`` module inside this function.

    Returns:
        True when a matching file name is found, otherwise False.
    """
    print(f"searching captcha with hash {hash} in {path}")
    # Escape the hash so it is matched literally, and require the whole file
    # name to match so that e.g. '<hash>_abcdef.jpeg.tmp' is not a hit.
    regex = re.compile(re.escape(hash) + r"_\w{6}\.jpeg")
    for _, _, files in walk(path):
        if any(regex.fullmatch(name) for name in files):
            return True
    return False


def search_and_download_new(captchas):
    """Download every captcha that is not yet stored in any dataset directory.

    Args:
        captchas: Iterable of mappings, each with an ``"id"`` and a ``"hash"``.
    """
    print("Searching and downloading new captchas")
    for captcha in captchas:
        captcha_id = captcha["id"]
        captcha_hash = captcha["hash"]
        # any() stops at the first directory containing the captcha, so the
        # remaining directory trees are not walked needlessly.
        already_saved = any(
            search_saved_captcha(captcha_hash, directory)
            for directory in (TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
        )
        if not already_saved:
            fetch_captcha(captcha_id)


def sort_datasets():
    """Move downloaded captchas into the testing and training directories.

    ``PERCENT_OF_TESTING`` percent (rounded) of the regular files directly
    inside ``DOWNLOAD_PATH`` are moved to ``TESTING_PATH``; the rest go to
    ``TRAINING_PATH``.

    Raises:
        TypeError: If ``PERCENT_OF_TESTING`` is not set.
        ValueError: If ``PERCENT_OF_TESTING`` is not an integer.
    """
    print("Sorting datasets")
    percent_of_testing = int(environ.get("PERCENT_OF_TESTING"))

    # Only top-level regular files are considered. The same list is used for
    # counting and for moving, so the split always matches the reported
    # numbers; sorting makes the split reproducible.
    new_files = sorted(
        name
        for name in listdir(DOWNLOAD_PATH)
        if path.isfile(path.join(DOWNLOAD_PATH, name))
    )
    amount_of_new_data = len(new_files)
    print(amount_of_new_data)

    amount_to_send_to_test = round(amount_of_new_data * (percent_of_testing / 100))
    print(amount_to_send_to_test)

    for index, name in enumerate(new_files):
        destination = TESTING_PATH if index < amount_to_send_to_test else TRAINING_PATH
        move(path.join(DOWNLOAD_PATH, name), destination)


def download_dataset():
    """Fetch the captcha list, download the new ones and sort them.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status while listing or fetching captchas.
    """
    prepare_dirs()
    response = requests.get(
        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all",
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    captchas = response.json()["captchas"]
    search_and_download_new(captchas)
    sort_datasets()


def train_nn():
    """Placeholder for network training; currently does nothing."""
    pass


if __name__ == "__main__":
    download_dataset()
    train_nn()