84 lines
3.3 KiB
Python
84 lines
3.3 KiB
Python
import requests
|
|
import re
|
|
|
|
from pathlib import Path
|
|
from src.config import config
|
|
from base64 import b64decode
|
|
from os import makedirs, walk, path
|
|
|
|
###############
|
|
#Dataset tools#
|
|
###############
|
|
|
|
def prepare_dirs() -> None:
    """Make sure the dataset directory exists, creating missing parents as needed."""
    # Creation of separate download/testing/training directories used to live
    # here but is disabled; only the single dataset directory is prepared.
    dataset_dir = config.DATASET_PATH
    makedirs(dataset_dir, exist_ok=True)
|
|
def fetch_captcha(id: str, timeout: float = 30.0) -> None:
    """Fetch one captcha from the API and save its image in the dataset directory.

    The image is stored under ``config.DATASET_PATH`` as
    ``<hash>_<solution>.jpeg``; its bytes are decoded from the base64
    ``image`` field of the JSON response. Failures are printed and the
    function returns without writing anything.

    Args:
        id: Captcha identifier, appended to ``{config.API}/captcha/``.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely on an unresponsive host.
    """
    try:
        response = requests.get(f"{config.API}/captcha/{id}", timeout=timeout)
        response.raise_for_status()
        captcha = response.json()["captcha"]
        # Decode BEFORE opening the output file: opening in "wb" mode creates
        # the file, so a corrupt payload would otherwise leave an empty
        # "<hash>_<solution>.jpeg" behind that looks like a saved captcha.
        image_bytes = b64decode(captcha["image"])
        # NOTE(review): hash and solution come from the API response and are
        # used verbatim in the file name -- confirm the API is trusted.
        captcha_file_path = path.join(
            config.DATASET_PATH,
            f"{captcha['hash']}_{captcha['solution']}.jpeg",
        )
    except requests.RequestException as e:
        print(f"Error fetching captcha {id}: {e}")
        return
    except (KeyError, TypeError, ValueError) as e:
        # Missing keys, a non-dict payload, invalid JSON or invalid base64.
        print(f"Error fetching captcha {id}: malformed response ({e!r})")
        return

    with open(captcha_file_path, "wb") as captcha_file:
        captcha_file.write(image_bytes)
|
|
|
|
def search_saved_captcha(hash: str, path: str) -> bool:
    """Return True if a saved captcha image for ``hash`` exists under ``path``.

    Saved captchas are named ``<hash>_<solution>.jpeg`` where the solution is
    exactly six word characters. The directory tree rooted at ``path`` is
    searched recursively; a non-existent ``path`` simply yields no match.

    Args:
        hash: Captcha hash that prefixes the file name; matched literally.
        path: Root directory to search.

    Returns:
        True when at least one file name matches the whole pattern.
    """
    # Escape the hash so regex metacharacters in it cannot match other names.
    pattern = re.compile(rf"{re.escape(hash)}_\w{{6}}\.jpeg")
    # fullmatch (not match) so leftovers such as "x_abcdef.jpeg.tmp" are not
    # mistaken for a completely saved captcha.
    return any(
        pattern.fullmatch(file_name)
        for _, _, file_names in walk(path)
        for file_name in file_names
    )
|
|
|
|
def search_and_download_new(captchas: list[dict]) -> None:
    """Download every listed captcha that is not yet stored in the dataset directory.

    Each entry must provide an ``"id"`` (used to fetch the captcha) and a
    ``"hash"`` (used to check whether it has already been saved).
    """
    for entry in captchas:
        captcha_id, captcha_hash = entry["id"], entry["hash"]
        already_saved = search_saved_captcha(captcha_hash, config.DATASET_PATH)
        if already_saved:
            continue
        fetch_captcha(captcha_id)
|
|
|
|
# def sort_datasets() -> None:
|
|
# """Sort downloaded captchas into training and testing datasets."""
|
|
# amount_of_new_data = len([file for file in listdir(config.DOWNLOAD_PATH) if path.isfile(path.join(config.DOWNLOAD_PATH, file))])
|
|
# amount_to_send_to_test = round(amount_of_new_data * (config.TESTING_PERCENT / 100))
|
|
#
|
|
# files = listdir(config.DOWNLOAD_PATH)
|
|
# for index, file in enumerate(files):
|
|
# if index < amount_to_send_to_test:
|
|
# move(path.join(config.DOWNLOAD_PATH, file), config.TESTING_PATH)
|
|
# else:
|
|
# move(path.join(config.DOWNLOAD_PATH, file), config.TRAINING_PATH)
|
|
|
|
def download_dataset(timeout: float = 30.0) -> None:
    """Download all captchas listed by the API that are not stored locally yet.

    Ensures the dataset directory exists, requests the listing from
    ``{config.API}/captcha/all`` and passes its ``"captchas"`` entry to
    ``search_and_download_new``. Request failures are printed, not raised.
    Splitting the data into training and testing sets is currently disabled.

    Args:
        timeout: Seconds to wait for the listing request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive host.
    """
    prepare_dirs()
    try:
        response = requests.get(f"{config.API}/captcha/all", timeout=timeout)
        response.raise_for_status()
        captchas = response.json()["captchas"]
        search_and_download_new(captchas)
    except requests.RequestException as e:
        print(f"Error downloading dataset: {e}")
|
|
|
|
def load_datasets() -> tuple[list, list, list]:
    """Collect the saved captcha images, their labels and the label alphabet.

    Scans ``config.DATASET_PATH`` (non-recursively) for ``*.jpeg`` files. The
    label of each image is the second ``_``-separated field of its file name
    with the ``.jpeg`` suffix removed. A summary is printed to stdout.

    Returns:
        A tuple ``(images, labels, characters)``: the sorted image paths as
        strings, the label of each image in the same order, and the sorted
        list of distinct characters occurring in the labels.
    """
    images = sorted(str(found) for found in Path(config.DATASET_PATH).glob("*.jpeg"))

    labels = []
    for image_path in images:
        file_name = image_path.split(path.sep)[-1]
        base_name = file_name.split(".jpeg")[0]
        labels.append(base_name.split("_")[1])

    characters = sorted({symbol for label in labels for symbol in label})

    print("Number of images found: ", len(images))
    print("Number of labels found: ", len(labels))
    print("Number of unique characters: ", len(characters))
    print("Characters present: ", characters)

    return images, labels, characters