This commit is contained in:
leca 2025-05-10 20:44:10 +03:00
parent d231ec9e57
commit 4686d5851d
1 changed files with 73 additions and 20 deletions

93
main.py
View File

@ -23,9 +23,21 @@ in the developer guides.
## Setup
"""
import os
import requests
import re
os.environ["KERAS_BACKEND"] = "tensorflow"
from os import makedirs, walk, environ, path, listdir
from dotenv import load_dotenv
# Load settings through python-dotenv before the environ.get() lookups below
# (presumably from a local .env file — confirm against the deployment setup).
load_dotenv()
# Constants
# Captcha image dimensions (presumably pixels — confirm against the preprocessing code).
IMAGE_HEIGHT = 70
IMAGE_WIDTH = 200
# Working directories taken from the environment; each is None when its variable is unset.
DOWNLOAD_PATH = environ.get("DOWNLOAD_PATH")
TESTING_PATH = environ.get("TESTING_PATH")
TRAINING_PATH = environ.get("TRAINING_PATH")
# Share of newly downloaded files that goes to the testing set, as a percentage.
# NOTE: int() raises TypeError here when PERCENT_OF_TESTING is not set in the environment.
PERCENT_OF_TESTING = int(environ.get("PERCENT_OF_TESTING"))
# Select the TensorFlow backend for Keras.
environ["KERAS_BACKEND"] = "tensorflow"
import numpy as np
import matplotlib.pyplot as plt
@ -37,34 +49,75 @@ import keras
from keras import ops
from keras import layers
"""
## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images)
Let's download the data.
"""
def prepare_dirs():
    """Ensure the download, testing and training directories exist.

    Directories that are already present are left untouched.
    """
    for directory in (DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH):
        makedirs(directory, exist_ok=True)
def fetch_captcha(id):
    """Fetch one captcha from the aggregator API and save it in DOWNLOAD_PATH.

    The JSON response is expected to carry a ``captcha`` object with
    ``hash``, ``solution`` and a base64-encoded ``image``; the decoded image
    is written to ``<hash>_<solution>.jpeg``.

    Args:
        id: Identifier of the captcha, interpolated into the request URL.

    Request failures (connection errors, timeouts, non-2xx statuses) are
    printed and not re-raised.
    """
    # Imported locally so the helper carries its own dependency.
    from base64 import b64decode

    url = f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}"
    try:
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        captcha = response.json()["captcha"]
        captcha_file_path = path.join(
            DOWNLOAD_PATH, f"{captcha['hash']}_{captcha['solution']}.jpeg"
        )
        with open(captcha_file_path, 'wb') as captcha_file:
            captcha_file.write(b64decode(captcha['image']))
    except requests.RequestException as e:
        print(f"Error fetching captcha {id}: {e}")
"""shell
curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
unzip -qq captcha_images_v2.zip
"""
def search_saved_captcha(hash, path):
    """Tell whether a captcha with the given hash is already stored under path.

    The directory tree rooted at ``path`` is walked recursively, looking for
    a file whose name starts with ``<hash>_<6 word characters>.jpeg``.

    Args:
        hash: Captcha hash that prefixes the file name.
        path: Root directory to search.

    Returns:
        True if a matching file is found, False otherwise.
    """
    # Escape the hash so characters such as "+" or "." are matched literally
    # instead of being interpreted as regex syntax.
    regex = re.compile(rf"{re.escape(str(hash))}_\w{{6}}\.jpeg")
    return any(
        regex.match(file)
        for _, _, files in walk(path)
        for file in files
    )
def search_and_download_new(captchas):
    """Download every listed captcha that is not stored locally yet.

    A captcha counts as stored when a file for its hash is found under the
    training, testing or download directory (checked in that order).

    Args:
        captchas: Iterable of mappings with ``"id"`` and ``"hash"`` keys.
    """
    storage_dirs = (TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
    for entry in captchas:
        captcha_id = entry["id"]
        captcha_hash = entry["hash"]
        already_saved = any(
            search_saved_captcha(captcha_hash, directory)
            for directory in storage_dirs
        )
        if already_saved:
            continue
        fetch_captcha(captcha_id)
"""
The dataset contains 1040 captcha files as `jpeg` images. The label for each sample is a string,
the part of the file name after the underscore (minus the file extension), upper-cased.
We will map each character in the string to an integer for training the model. Similarly,
we will need to map the predictions of the model back to strings. For this purpose
we will maintain two dictionaries, mapping characters to integers, and integers to characters,
respectively.
"""
def sort_datasets():
    """Split freshly downloaded captchas between the testing and training sets.

    ``PERCENT_OF_TESTING`` percent of the regular files in DOWNLOAD_PATH
    (rounded with ``round``) are moved to TESTING_PATH; the remaining files
    are moved to TRAINING_PATH. Sub-directories of DOWNLOAD_PATH are left
    in place.
    """
    # Imported locally so the helper carries its own dependency.
    from shutil import move

    # Use one list for both the count and the loop, so directories are
    # neither counted nor moved and the split ratio stays consistent.
    new_files = [
        file
        for file in listdir(DOWNLOAD_PATH)
        if path.isfile(path.join(DOWNLOAD_PATH, file))
    ]
    amount_to_send_to_test = round(len(new_files) * (PERCENT_OF_TESTING / 100))
    for index, file in enumerate(new_files):
        if index < amount_to_send_to_test:
            destination = TESTING_PATH
        else:
            destination = TRAINING_PATH
        move(path.join(DOWNLOAD_PATH, file), destination)
def download_dataset():
    """Download new captchas and sort them into training and testing sets.

    Creates the working directories, asks the aggregator API for the list of
    all captchas, downloads the ones not stored locally yet and then splits
    the downloaded files between the testing and training directories.

    Request failures (connection errors, timeouts, non-2xx statuses) are
    printed and not re-raised.
    """
    prepare_dirs()
    url = f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all"
    try:
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        captchas = response.json()["captchas"]
        search_and_download_new(captchas)
        sort_datasets()
    except requests.RequestException as e:
        print(f"Error downloading dataset: {e}")
# Fetch captchas that are not stored locally yet and split them into the
# testing and training directories.
download_dataset()

# Read the training images from the same directory the downloader fills,
# instead of a hard-coded path that can drift from the TRAINING_PATH setting.
data_dir = Path(TRAINING_PATH)

# Sorted list of all image paths, as strings
images = sorted(str(image_path) for image_path in data_dir.glob("*.jpeg"))

# File names look like "<hash>_<solution>.jpeg"; the label is the
# upper-cased solution part.
labels = [
    img.split(path.sep)[-1].split(".jpeg")[0].split("_")[1].upper()
    for img in images
]

# Sorted list of the distinct characters that occur in the labels
characters = sorted({char for label in labels for char in label})
@ -316,7 +369,7 @@ def build_model():
# Define the model
model = keras.models.Model(
inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
inputs=[input_img, labels], outputs=output, name="captcha_solver"
)
# Optimizer
opt = keras.optimizers.Adam()