dataset fetch and sorting

This commit is contained in:
leca 2025-05-04 00:56:55 +03:00
parent f1293aadab
commit 60fa54b76e
3 changed files with 87 additions and 0 deletions

78
main.py Normal file
View File

@ -0,0 +1,78 @@
from os import environ, makedirs, path, walk, listdir
from shutil import move
from dotenv import load_dotenv
from base64 import b64decode
import re
import requests
# Load variables from a .env file into the process environment before any
# of the settings below are read.
load_dotenv()

# Directory settings taken from the environment. Each one is None when the
# corresponding variable is not set.
DOWNLOAD_PATH = environ.get("DOWNLOAD_PATH")  # freshly fetched captchas land here
TESTING_PATH = environ.get("TESTING_PATH")    # testing share of the dataset
TRAINING_PATH = environ.get("TRAINING_PATH")  # training share of the dataset
def prepare_dirs():
    """Create the download, testing and training directories if missing."""
    for directory in (DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH):
        # exist_ok=True makes repeated runs a no-op for existing directories.
        makedirs(directory, exist_ok=True)
def fetch_captcha(id):
    """Download a single captcha from the aggregator API and save it to disk.

    The response JSON is expected to carry a ``captcha`` object with
    ``hash``, ``solution`` and base64-encoded ``image`` fields. The decoded
    image is written to ``DOWNLOAD_PATH`` as ``<hash>_<solution>.jpeg``.

    Args:
        id: Captcha identifier, interpolated into the request URL.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    print(f"Fetching captcha with id {id}")
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(
        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}",
        timeout=30,
    )
    # Fail on the HTTP error itself instead of on a confusing JSON/KeyError
    # raised while parsing an error page.
    response.raise_for_status()
    captcha = response.json()["captcha"]
    with open(f"{DOWNLOAD_PATH}/{captcha['hash']}_{captcha['solution']}.jpeg", 'wb') as captcha_file:
        captcha_file.write(b64decode(captcha['image']))
def search_saved_captcha(hash, path):
    """Tell whether a captcha with the given hash is stored under ``path``.

    The directory tree rooted at ``path`` is walked recursively and each
    file name is checked against ``<hash>_<six word characters>.jpeg``.

    Args:
        hash: Captcha hash that the file name must start with.
        path: Root directory to search.

    Returns:
        True if at least one matching file exists, False otherwise.
    """
    print(f"searching captcha with hash {hash} in {path}")
    # The hash is data, not a pattern: escape it so characters such as
    # '.' or '+' cannot match unrelated names or break compilation.
    regex = re.compile(re.escape(hash) + r'_\w{6}\.jpeg')
    return any(
        regex.match(file)
        for _, _, files in walk(path)
        for file in files
    )
def search_and_download_new(captchas):
    """Fetch every listed captcha that is not already stored locally.

    A captcha counts as stored when a file for its hash is found in the
    training, testing or download directory (checked in that order).

    Args:
        captchas: Iterable of mappings, each with ``id`` and ``hash`` keys.
    """
    print("Searching and downloading new captchas")
    for captcha in captchas:
        captcha_id = captcha["id"]
        captcha_hash = captcha["hash"]
        # any() stops at the first directory that holds the captcha, so the
        # remaining trees are not walked for nothing.
        already_saved = any(
            search_saved_captcha(captcha_hash, directory)
            for directory in (TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
        )
        if not already_saved:
            fetch_captcha(captcha_id)
def sort_datasets():
    """Split freshly downloaded captchas between the testing and training sets.

    ``PERCENT_OF_TESTING`` (read from the environment) is the share of the
    files directly inside ``DOWNLOAD_PATH`` that is moved to
    ``TESTING_PATH``; the remaining files are moved to ``TRAINING_PATH``.
    """
    print("Sorting datasets")
    percent_of_testing = int(environ.get("PERCENT_OF_TESTING"))
    # Use one listing for both counting and moving. Counting top-level files
    # but moving via a recursive walk would restart the index in every
    # subdirectory and build wrong source paths for nested files.
    new_files = [
        name for name in listdir(DOWNLOAD_PATH)
        if path.isfile(f'{DOWNLOAD_PATH}/{name}')
    ]
    amount_of_new_data = len(new_files)
    print(amount_of_new_data)
    amount_to_send_to_test = round(amount_of_new_data * (percent_of_testing / 100))
    print(amount_to_send_to_test)
    for index, name in enumerate(new_files):
        if index < amount_to_send_to_test:
            destination = TESTING_PATH
        else:
            destination = TRAINING_PATH
        move(f"{DOWNLOAD_PATH}/{name}", destination)
def download_dataset():
    """Bring the local dataset up to date with the aggregator API.

    Creates the working directories, requests the list of all captchas,
    downloads the ones not yet stored locally, then distributes the new
    files between the testing and training directories.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status while listing captchas.
    """
    prepare_dirs()
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(
        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all",
        timeout=30,
    )
    # Surface HTTP errors directly rather than as a JSON/KeyError failure.
    response.raise_for_status()
    captchas = response.json()["captchas"]
    search_and_download_new(captchas)
    sort_datasets()
def train_nn():
    """Placeholder for neural-network training; currently does nothing."""
    return None
# Script entry point: refresh the local dataset first, then run training
# (train_nn is currently a stub).
if __name__ == "__main__":
    download_dataset()
    train_nn()

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
certifi==2025.4.26
charset-normalizer==3.4.2
dotenv==0.9.9
idna==3.10
python-dotenv==1.1.0
requests==2.32.3
urllib3==2.4.0

2
sample.env Normal file
View File

@ -0,0 +1,2 @@
CAPTCHA_AGGREGATOR_API=https://captcha.foxarmy.org/api
PERCENT_OF_TESTING=10