dataset fetch and sorting

2025-05-04 00:56:55 +03:00
parent f1293aadab
commit 60fa54b76e
3 changed files with 87 additions and 0 deletions
--- a/main.py
+++ b/main.py
@@ -0,0 +1,78 @@
+from os import environ, makedirs, path, walk, listdir
+from shutil import move
+from dotenv import load_dotenv
+from base64 import b64decode
+import re
+import requests
+
+# Load variables from a .env file into the process environment.
+load_dotenv()
+
+# Dataset directories, read from the environment; each one is None when the
+# corresponding variable is not set.
+DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH = (
+    environ.get(variable)
+    for variable in ("DOWNLOAD_PATH", "TESTING_PATH", "TRAINING_PATH")
+)
+
+def prepare_dirs():
+    """Ensure the download, testing and training directories exist."""
+    for directory in (DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH):
+        # exist_ok=True makes this a no-op for directories already present.
+        makedirs(directory, exist_ok=True)
+
+def fetch_captcha(id, timeout=30):
+    """Download one captcha from the aggregator API and save it as a JPEG.
+
+    The decoded image is written to DOWNLOAD_PATH as
+    ``<hash>_<solution>.jpeg``.
+
+    Args:
+        id: Identifier of the captcha; appended to the ``/captcha/`` URL.
+        timeout: Seconds to wait for the server before giving up.
+
+    Raises:
+        requests.HTTPError: If the API answers with an error status.
+        ValueError: If the returned hash or solution would place the file
+            outside DOWNLOAD_PATH.
+    """
+    print(f"Fetching captcha with id {id}")
+    response = requests.get(
+        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}",
+        timeout=timeout,  # without a timeout a stalled server blocks forever
+    )
+    # Fail on 4xx/5xx here rather than on a confusing JSON/KeyError below.
+    response.raise_for_status()
+    captcha = response.json()["captcha"]
+
+    # The name is built from server-supplied values, so refuse anything that
+    # carries a directory component.
+    file_name = f"{captcha['hash']}_{captcha['solution']}.jpeg"
+    if path.basename(file_name) != file_name:
+        raise ValueError(f"Unsafe captcha file name: {file_name!r}")
+
+    with open(path.join(DOWNLOAD_PATH, file_name), "wb") as captcha_file:
+        captcha_file.write(b64decode(captcha["image"]))
+
+def search_saved_captcha(hash, path):
+    """Report whether a captcha with the given hash is stored under *path*.
+
+    Saved captchas are named ``<hash>_<solution>.jpeg``; the pattern expects
+    the solution part to be exactly six word characters. The whole directory
+    tree below *path* is searched.
+
+    Args:
+        hash: Captcha hash that prefixes the file name.
+        path: Root directory to search.
+
+    Returns:
+        True if a matching file name exists, False otherwise.
+    """
+    print(f"searching captcha with hash {hash} in {path}")
+    # Escape the hash so regex metacharacters in it are matched literally.
+    pattern = re.compile(re.escape(hash) + r"_\w{6}\.jpeg")
+
+    # fullmatch: the entire file name must fit, not merely a prefix of it.
+    return any(
+        pattern.fullmatch(file_name)
+        for _, _, file_names in walk(path)
+        for file_name in file_names
+    )
+
+def search_and_download_new(captchas):
+    """Fetch every listed captcha that is not stored locally yet.
+
+    A captcha counts as stored when a file for its hash is found under
+    TRAINING_PATH, TESTING_PATH or DOWNLOAD_PATH.
+
+    Args:
+        captchas: Iterable of mappings providing at least the ``id`` and
+            ``hash`` keys.
+    """
+    print("Searching and downloading new captchas")
+    for captcha in captchas:
+        captcha_hash = captcha["hash"]
+        # any() stops at the first directory that already holds the captcha,
+        # so the remaining trees are not walked needlessly.
+        already_saved = any(
+            search_saved_captcha(captcha_hash, directory)
+            for directory in (TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
+        )
+        if not already_saved:
+            fetch_captcha(captcha["id"])
+
+def sort_datasets():
+    """Split the downloaded captchas between the testing and training sets.
+
+    PERCENT_OF_TESTING percent (rounded) of the regular files directly inside
+    DOWNLOAD_PATH are moved to TESTING_PATH; the rest go to TRAINING_PATH.
+    """
+    print("Sorting datasets")
+    percent_of_testing = int(environ.get("PERCENT_OF_TESTING"))
+    # One listing drives both the count and the moves. Walking the tree here
+    # would also yield files from subdirectories, which were never counted and
+    # do not exist at DOWNLOAD_PATH/<name>.
+    new_files = [
+        name for name in listdir(DOWNLOAD_PATH)
+        if path.isfile(path.join(DOWNLOAD_PATH, name))
+    ]
+    amount_of_new_data = len(new_files)
+    print(amount_of_new_data)
+    amount_to_send_to_test = round(amount_of_new_data * (percent_of_testing / 100))
+    print(amount_to_send_to_test)
+    for index, name in enumerate(new_files):
+        # The first amount_to_send_to_test files form the testing share.
+        destination = TESTING_PATH if index < amount_to_send_to_test else TRAINING_PATH
+        move(path.join(DOWNLOAD_PATH, name), destination)
+
+def download_dataset(timeout=30):
+    """Create the dataset directories, download new captchas and sort them.
+
+    Args:
+        timeout: Seconds to wait for the captcha listing before giving up.
+
+    Raises:
+        requests.HTTPError: If the listing request returns an error status.
+    """
+    prepare_dirs()
+
+    response = requests.get(
+        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all",
+        timeout=timeout,  # without a timeout a stalled server blocks forever
+    )
+    # Surface HTTP errors directly instead of failing later on the JSON body.
+    response.raise_for_status()
+    captchas = response.json()["captchas"]
+
+    search_and_download_new(captchas)
+    sort_datasets()
+
+    
+
+def train_nn():
+    """Placeholder training step; not implemented yet, returns None."""
+
+if __name__ == "__main__":
+    # Pipeline stages, run in order: refresh the dataset, then train.
+    for stage in (download_dataset, train_nn):
+        stage()
+    
+
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+certifi==2025.4.26
+charset-normalizer==3.4.2
+dotenv==0.9.9
+idna==3.10
+python-dotenv==1.1.0
+requests==2.32.3
+urllib3==2.4.0
--- a/sample.env
+++ b/sample.env
@@ -0,0 +1,2 @@
+# Base URL of the captcha aggregator API; main.py appends /captcha/all and
+# /captcha/<id> to it.
+CAPTCHA_AGGREGATOR_API=https://captcha.foxarmy.org/api
+# Percentage of newly downloaded captchas moved to the testing set.
+PERCENT_OF_TESTING=10
+# Directories main.py reads from the environment (example values; adjust).
+DOWNLOAD_PATH=./dataset/new
+TESTING_PATH=./dataset/testing
+TRAINING_PATH=./dataset/training