From 4686d5851d22615ac98d86d6d18012fa03874ecb Mon Sep 17 00:00:00 2001 From: leca Date: Sat, 10 May 2025 20:44:10 +0300 Subject: [PATCH] wip --- main.py | 93 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/main.py b/main.py index 06a8aff..56f0145 100644 --- a/main.py +++ b/main.py @@ -23,9 +23,21 @@ in the developer guides. ## Setup """ -import os +import requests +import re -os.environ["KERAS_BACKEND"] = "tensorflow" +from os import makedirs, walk, environ, path, listdir +from dotenv import load_dotenv +load_dotenv() + +# Constants +IMAGE_HEIGHT = 70 +IMAGE_WIDTH = 200 +DOWNLOAD_PATH = environ.get("DOWNLOAD_PATH") +TESTING_PATH = environ.get("TESTING_PATH") +TRAINING_PATH = environ.get("TRAINING_PATH") +PERCENT_OF_TESTING = int(environ.get("PERCENT_OF_TESTING")) +environ["KERAS_BACKEND"] = "tensorflow" import numpy as np import matplotlib.pyplot as plt @@ -37,34 +49,75 @@ import keras from keras import ops from keras import layers -""" -## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images) -Let's download the data. 
-"""
+def prepare_dirs():
+    """Create the download, testing and training directories.
+
+    Directories that already exist are left untouched.
+    """
+    for directory in (DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH):
+        makedirs(directory, exist_ok=True)
+
+
+def fetch_captcha(id, timeout=30):
+    """Fetch one captcha by ID and save its image under DOWNLOAD_PATH.
+
+    The image is written as ``<hash>_<solution>.jpeg``. Request failures and
+    malformed responses are printed and otherwise ignored, so one bad captcha
+    does not abort a batch download.
+
+    Args:
+        id: Identifier of the captcha in the aggregator API.
+        timeout: Seconds to wait for the API before giving up.
+    """
+    # Imported locally so the name is always in scope.
+    from base64 import b64decode
+
+    try:
+        response = requests.get(
+            f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}",
+            timeout=timeout,
+        )
+        response.raise_for_status()
+        captcha = response.json()["captcha"]
+        # Decode before opening the file so a bad payload cannot leave an
+        # empty image behind in the download directory.
+        image = b64decode(captcha["image"])
+        captcha_file_path = path.join(
+            DOWNLOAD_PATH, f"{captcha['hash']}_{captcha['solution']}.jpeg"
+        )
+    except requests.RequestException as e:
+        print(f"Error fetching captcha {id}: {e}")
+        return
+    except (KeyError, TypeError, ValueError) as e:
+        print(f"Malformed response for captcha {id}: {e!r}")
+        return
+
+    with open(captcha_file_path, "wb") as captcha_file:
+        captcha_file.write(image)
+
+
-"""shell
-curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
-unzip -qq captcha_images_v2.zip
-"""
+def search_saved_captcha(hash, path):
+    """Return True if a captcha with the given hash is saved under path.
+
+    A saved file is named ``<hash>_<solution>.jpeg`` where the solution is
+    six word characters. The directory tree below path is searched
+    recursively.
+
+    Args:
+        hash: Captcha hash that prefixes the file name.
+        path: Root directory to search.
+    """
+    # Escape the hash so regex metacharacters in it are matched literally,
+    # and require the whole file name to match, not just a prefix.
+    pattern = re.compile(rf"{re.escape(str(hash))}_\w{{6}}\.jpeg")
+    return any(
+        pattern.fullmatch(file)
+        for _, _, files in walk(path)
+        for file in files
+    )
+
+
+def search_and_download_new(captchas):
+    """Download every listed captcha that is not saved locally yet.
+
+    A captcha counts as saved when a matching file exists under
+    TRAINING_PATH, TESTING_PATH or DOWNLOAD_PATH.
+
+    Args:
+        captchas: Iterable of mappings with "id" and "hash" keys.
+    """
+    for captcha in captchas:
+        captcha_id = captcha["id"]
+        captcha_hash = captcha["hash"]
+        already_saved = any(
+            search_saved_captcha(captcha_hash, directory)
+            for directory in (TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
+        )
+        if not already_saved:
+            fetch_captcha(captcha_id)
+
+
-"""
-The dataset contains 1040 captcha files as `jpeg` images. The label for each sample is a string,
-the name of the file (minus the file extension).
-We will map each character in the string to an integer for training the model. Similary,
-we will need to map the predictions of the model back to strings. For this purpose
-we will maintain two dictionaries, mapping characters to integers, and integers to characters,
-respectively.
-"""
+def sort_datasets():
+    """Move downloaded captchas into the testing and training directories.
+
+    PERCENT_OF_TESTING percent of the regular files in DOWNLOAD_PATH
+    (rounded) are moved to TESTING_PATH; the remaining files are moved to
+    TRAINING_PATH. Subdirectories of DOWNLOAD_PATH are left in place.
+    """
+    # Imported locally so the name is always in scope.
+    from shutil import move
+
+    # Count and move the same set of entries: regular files only. Sorting
+    # makes the split repeatable, since listdir order is arbitrary.
+    files = sorted(
+        file
+        for file in listdir(DOWNLOAD_PATH)
+        if path.isfile(path.join(DOWNLOAD_PATH, file))
+    )
+    amount_to_send_to_test = round(len(files) * (PERCENT_OF_TESTING / 100))
+
+    for index, file in enumerate(files):
+        if index < amount_to_send_to_test:
+            destination = TESTING_PATH
+        else:
+            destination = TRAINING_PATH
+        move(path.join(DOWNLOAD_PATH, file), destination)
+
+
+def download_dataset(timeout=30):
+    """Download new captchas and sort them into training and testing sets.
+
+    Fetches the captcha listing from the aggregator API, downloads the
+    captchas that are not saved locally, then splits the download directory.
+    If the listing cannot be fetched or is malformed, the error is printed
+    and nothing is downloaded or sorted.
+
+    Args:
+        timeout: Seconds to wait for the listing request before giving up.
+    """
+    prepare_dirs()
+    try:
+        response = requests.get(
+            f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all",
+            timeout=timeout,
+        )
+        response.raise_for_status()
+        captchas = response.json()["captchas"]
+    except requests.RequestException as e:
+        print(f"Error downloading dataset: {e}")
+        return
+    except (KeyError, TypeError) as e:
+        print(f"Malformed captcha listing: {e!r}")
+        return
+
+    search_and_download_new(captchas)
+    sort_datasets()
+
+
+download_dataset()

 # Path to the data directory
 data_dir = Path("./datasets/training")

 # Get list of all the images
 images = sorted(list(map(str, list(data_dir.glob("*.jpeg")))))
-labels = [img.split(os.path.sep)[-1].split(".jpeg")[0].split("_")[1].upper() for img in images]
+labels = [img.split(path.sep)[-1].split(".jpeg")[0].split("_")[1].upper() for img in images]
 characters = set(char for label in labels for char in label)
 characters = sorted(list(characters))

@@ -316,7 +369,7 @@ def build_model():

     # Define the model
     model = keras.models.Model(
-        inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
+        inputs=[input_img, labels], outputs=output, name="captcha_solver"
     )
     # Optimizer
     opt = keras.optimizers.Adam()