wip
This commit is contained in:
parent
d231ec9e57
commit
4686d5851d
93
main.py
93
main.py
|
@ -23,9 +23,21 @@ in the developer guides.
|
||||||
## Setup
|
## Setup
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
import re
from base64 import b64decode
from os import environ, listdir, makedirs, path, walk
from pathlib import Path
from shutil import move

import requests
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
# Constants
|
||||||
|
IMAGE_HEIGHT = 70
|
||||||
|
IMAGE_WIDTH = 200
|
||||||
|
DOWNLOAD_PATH = environ.get("DOWNLOAD_PATH")
|
||||||
|
TESTING_PATH = environ.get("TESTING_PATH")
|
||||||
|
TRAINING_PATH = environ.get("TRAINING_PATH")
|
||||||
|
PERCENT_OF_TESTING = int(environ.get("PERCENT_OF_TESTING"))
|
||||||
|
environ["KERAS_BACKEND"] = "tensorflow"
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
@ -37,34 +49,75 @@ import keras
|
||||||
from keras import ops
|
from keras import ops
|
||||||
from keras import layers
|
from keras import layers
|
||||||
|
|
||||||
"""
|
def prepare_dirs():
|
||||||
## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images)
|
"""Create necessary directories for downloading and storing images."""
|
||||||
Let's download the data.
|
makedirs(DOWNLOAD_PATH, exist_ok=True)
|
||||||
"""
|
makedirs(TESTING_PATH, exist_ok=True)
|
||||||
|
makedirs(TRAINING_PATH, exist_ok=True)
|
||||||
|
|
||||||
|
def fetch_captcha(id):
|
||||||
|
"""Fetch a captcha image by its ID and save it to the download path."""
|
||||||
|
try:
|
||||||
|
response = requests.get(f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}")
|
||||||
|
response.raise_for_status()
|
||||||
|
captcha = response.json()["captcha"]
|
||||||
|
captcha_file_path = path.join(DOWNLOAD_PATH, f"{captcha['hash']}_{captcha['solution']}.jpeg")
|
||||||
|
with open(captcha_file_path, 'wb') as captcha_file:
|
||||||
|
captcha_file.write(b64decode(captcha['image']))
|
||||||
|
except requests.RequestException as e:
|
||||||
|
print(f"Error fetching captcha {id}: {e}")
|
||||||
|
|
||||||
"""shell
|
def search_saved_captcha(hash, path):
|
||||||
curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
|
"""Check if a captcha with the given hash exists in the specified path."""
|
||||||
unzip -qq captcha_images_v2.zip
|
regex = re.compile(f"{hash}_\\w{{6}}\\.jpeg")
|
||||||
"""
|
for _, _, files in walk(path):
|
||||||
|
for file in files:
|
||||||
|
if regex.match(file):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def search_and_download_new(captchas):
|
||||||
|
"""Search for new captchas and download them if they don't already exist."""
|
||||||
|
for captcha in captchas:
|
||||||
|
id = captcha["id"]
|
||||||
|
hash = captcha["hash"]
|
||||||
|
if not (search_saved_captcha(hash, TRAINING_PATH) or
|
||||||
|
search_saved_captcha(hash, TESTING_PATH) or
|
||||||
|
search_saved_captcha(hash, DOWNLOAD_PATH)):
|
||||||
|
fetch_captcha(id)
|
||||||
|
|
||||||
"""
|
def sort_datasets():
|
||||||
The dataset contains 1040 captcha files as `jpeg` images. The label for each sample is a string,
|
"""Sort downloaded captchas into training and testing datasets."""
|
||||||
the name of the file (minus the file extension).
|
amount_of_new_data = len([file for file in listdir(DOWNLOAD_PATH) if path.isfile(path.join(DOWNLOAD_PATH, file))])
|
||||||
We will map each character in the string to an integer for training the model. Similary,
|
amount_to_send_to_test = round(amount_of_new_data * (PERCENT_OF_TESTING / 100))
|
||||||
we will need to map the predictions of the model back to strings. For this purpose
|
|
||||||
we will maintain two dictionaries, mapping characters to integers, and integers to characters,
|
files = listdir(DOWNLOAD_PATH)
|
||||||
respectively.
|
for index, file in enumerate(files):
|
||||||
"""
|
if index < amount_to_send_to_test:
|
||||||
|
move(path.join(DOWNLOAD_PATH, file), TESTING_PATH)
|
||||||
|
else:
|
||||||
|
move(path.join(DOWNLOAD_PATH, file), TRAINING_PATH)
|
||||||
|
|
||||||
|
def download_dataset():
|
||||||
|
"""Download the dataset of captchas and sort them into training and testing sets."""
|
||||||
|
prepare_dirs()
|
||||||
|
try:
|
||||||
|
response = requests.get(f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all")
|
||||||
|
response.raise_for_status()
|
||||||
|
captchas = response.json()["captchas"]
|
||||||
|
search_and_download_new(captchas)
|
||||||
|
sort_datasets()
|
||||||
|
except requests.RequestException as e:
|
||||||
|
print(f"Error downloading dataset: {e}")
|
||||||
|
|
||||||
|
download_dataset()
|
||||||
|
|
||||||
# Path to the data directory
|
# Path to the data directory
|
||||||
data_dir = Path("./datasets/training")
|
data_dir = Path("./datasets/training")
|
||||||
|
|
||||||
# Get list of all the images
|
# Get list of all the images
|
||||||
images = sorted(list(map(str, list(data_dir.glob("*.jpeg")))))
|
images = sorted(list(map(str, list(data_dir.glob("*.jpeg")))))
|
||||||
labels = [img.split(os.path.sep)[-1].split(".jpeg")[0].split("_")[1].upper() for img in images]
|
labels = [img.split(path.sep)[-1].split(".jpeg")[0].split("_")[1].upper() for img in images]
|
||||||
characters = set(char for label in labels for char in label)
|
characters = set(char for label in labels for char in label)
|
||||||
characters = sorted(list(characters))
|
characters = sorted(list(characters))
|
||||||
|
|
||||||
|
@ -316,7 +369,7 @@ def build_model():
|
||||||
|
|
||||||
# Define the model
|
# Define the model
|
||||||
model = keras.models.Model(
|
model = keras.models.Model(
|
||||||
inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
|
inputs=[input_img, labels], outputs=output, name="captcha_solver"
|
||||||
)
|
)
|
||||||
# Optimizer
|
# Optimizer
|
||||||
opt = keras.optimizers.Adam()
|
opt = keras.optimizers.Adam()
|
||||||
|
|
Loading…
Reference in New Issue