From 4686d5851d22615ac98d86d6d18012fa03874ecb Mon Sep 17 00:00:00 2001
From: leca <leca@foxarmy.org>
Date: Sat, 10 May 2025 20:44:10 +0300
Subject: [PATCH] Download captcha dataset from the aggregator API and split it into training/testing sets

---
 main.py | 93 ++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 73 insertions(+), 20 deletions(-)

diff --git a/main.py b/main.py
index 06a8aff..56f0145 100644
--- a/main.py
+++ b/main.py
@@ -23,9 +23,21 @@ in the developer guides.
 ## Setup
 """
 
-import os
+import requests
+import re
 
-os.environ["KERAS_BACKEND"] = "tensorflow"
+from os import makedirs, walk, environ, path, listdir
+from dotenv import load_dotenv
+load_dotenv()
+
+# Constants (paths and the split percentage are read from environment variables)
+IMAGE_HEIGHT = 70
+IMAGE_WIDTH = 200
+DOWNLOAD_PATH = environ.get("DOWNLOAD_PATH")  # None when the variable is unset
+TESTING_PATH = environ.get("TESTING_PATH")  # None when the variable is unset
+TRAINING_PATH = environ.get("TRAINING_PATH")  # None when the variable is unset
+PERCENT_OF_TESTING = int(environ.get("PERCENT_OF_TESTING"))  # int(None) raises TypeError when unset
+environ["KERAS_BACKEND"] = "tensorflow"  # set before keras is imported further down
 
 import numpy as np
 import matplotlib.pyplot as plt
@@ -37,34 +49,75 @@ import keras
 from keras import ops
 from keras import layers
 
-"""
-## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images)
-Let's download the data.
-"""
+def prepare_dirs():
+    """Ensure the download, testing and training directories exist."""
+    # exist_ok=True: directories that are already present are left as they are.
+    for directory in (DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH):
+        makedirs(directory, exist_ok=True)
 
+def fetch_captcha(id):
+    """Fetch captcha `id` from the API and save it as `<hash>_<solution>.jpeg` in DOWNLOAD_PATH; request errors are printed, not raised."""
+    from base64 import b64decode  # imported here because the module-level imports do not provide it
+    try:
+        response = requests.get(f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}")
+        response.raise_for_status()  # turn HTTP error statuses into RequestException
+        captcha = response.json()["captcha"]
+        with open(path.join(DOWNLOAD_PATH, f"{captcha['hash']}_{captcha['solution']}.jpeg"), 'wb') as captcha_file:
+            captcha_file.write(b64decode(captcha['image']))  # the image arrives base64-encoded
+    except requests.RequestException as e:
+        print(f"Error fetching captcha {id}: {e}")
 
-"""shell
-curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
-unzip -qq captcha_images_v2.zip
-"""
+def search_saved_captcha(hash, path):
+    """Return True if a file named `<hash>_<6 word chars>.jpeg` exists anywhere under `path`, else False."""
+    # Escape the hash so it is matched literally; fullmatch rejects names with trailing extras.
+    regex = re.compile(f"{re.escape(str(hash))}_\\w{{6}}\\.jpeg")
+    for _, _, files in walk(path):
+        if any(regex.fullmatch(file) for file in files):
+            return True
+    return False
 
+def search_and_download_new(captchas):
+    """Download every listed captcha whose hash is not yet stored in any dataset directory."""
+    for entry in captchas:
+        captcha_id = entry["id"]
+        captcha_hash = entry["hash"]
+        # Checked in this order; the search stops at the first directory containing the hash.
+        known_dirs = (TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
+        if not any(search_saved_captcha(captcha_hash, directory) for directory in known_dirs):
+            fetch_captcha(captcha_id)
 
-"""
-The dataset contains 1040 captcha files as `jpeg` images. The label for each sample is a string,
-the name of the file (minus the file extension).
-We will map each character in the string to an integer for training the model. Similary,
-we will need to map the predictions of the model back to strings. For this purpose
-we will maintain two dictionaries, mapping characters to integers, and integers to characters,
-respectively.
-"""
+def sort_datasets():
+    """Move the files in DOWNLOAD_PATH: the first PERCENT_OF_TESTING percent to TESTING_PATH, the rest to TRAINING_PATH."""
+    from shutil import move  # imported here because the module-level imports do not provide it
+
+    # List regular files only, so the test quota and the loop work on the same entries.
+    files = [file for file in listdir(DOWNLOAD_PATH) if path.isfile(path.join(DOWNLOAD_PATH, file))]
+    amount_to_send_to_test = round(len(files) * (PERCENT_OF_TESTING / 100))
+    for index, file in enumerate(files):
+        # Files before the quota index go to testing; everything after goes to training.
+        destination = TESTING_PATH if index < amount_to_send_to_test else TRAINING_PATH
+        move(path.join(DOWNLOAD_PATH, file), destination)
 
+def download_dataset():
+    """Fetch the captcha listing, download the captchas not stored yet and split them into datasets."""
+    prepare_dirs()
+    listing_url = f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all"
+    try:
+        listing = requests.get(listing_url)
+        listing.raise_for_status()
+        search_and_download_new(listing.json()["captchas"])
+        sort_datasets()
+    except requests.RequestException as e:
+        print(f"Error downloading dataset: {e}")
+
+download_dataset()
 
 # Path to the data directory
 data_dir = Path("./datasets/training")
 
 # Get list of all the images
 images = sorted(list(map(str, list(data_dir.glob("*.jpeg")))))
-labels = [img.split(os.path.sep)[-1].split(".jpeg")[0].split("_")[1].upper() for img in images]
+labels = [path.basename(img).split(".jpeg")[0].split("_")[1].upper() for img in images]  # text after "_" in the file name, upper-cased
 characters = set(char for label in labels for char in label)
 characters = sorted(list(characters))
 
@@ -316,7 +369,7 @@ def build_model():
 
     # Define the model
     model = keras.models.Model(
-        inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
+        inputs=[input_img, labels], outputs=output, name="captcha_solver"
     )
     # Optimizer
     opt = keras.optimizers.Adam()