Compare commits
No commits in common. "d1be49d74090e8204b2ad5d3bb10085d334a1cd2" and "f1293aadababfeb89cecf9a8d9a1d3bf3be3091d" have entirely different histories.
d1be49d740
...
f1293aadab
|
@ -160,6 +160,3 @@ cython_debug/
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
|
||||||
.env
|
|
||||||
datasets
|
|
||||||
|
|
78
main.py
78
main.py
|
@ -1,78 +0,0 @@
|
||||||
from os import environ, makedirs, path, walk, listdir
|
|
||||||
from shutil import move
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from base64 import b64decode
|
|
||||||
import re
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Load variables via python-dotenv before any of them are read below.
load_dotenv()

# Dataset directories taken from the environment. Each one is None when the
# corresponding variable is not set.
DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH = (
    environ.get(variable)
    for variable in ("DOWNLOAD_PATH", "TESTING_PATH", "TRAINING_PATH")
)
|
|
||||||
|
|
||||||
def prepare_dirs():
    """Create the download, testing and training directories if missing."""
    for directory in (DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH):
        # exist_ok=True turns this into a no-op for existing directories.
        makedirs(directory, exist_ok=True)
|
|
||||||
|
|
||||||
def fetch_captcha(id, timeout=30):
    """Download a single captcha from the aggregator API and save its image.

    The image is written to ``DOWNLOAD_PATH`` as ``<hash>_<solution>.jpeg``,
    using the ``hash`` and ``solution`` fields of the API response.

    Args:
        id: Identifier of the captcha, interpolated into the request URL.
            (The name shadows the builtin but is kept for compatibility.)
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    print(f"Fetching captcha with id {id}")
    response = requests.get(
        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}",
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    captcha = response.json()["captcha"]

    target = f"{DOWNLOAD_PATH}/{captcha['hash']}_{captcha['solution']}.jpeg"
    with open(target, "wb") as captcha_file:
        # The API delivers the image base64-encoded; store the raw bytes.
        captcha_file.write(b64decode(captcha["image"]))
|
|
||||||
|
|
||||||
def search_saved_captcha(hash, path):
    """Report whether a captcha with the given hash is stored under *path*.

    A stored captcha is a file named ``<hash>_<6 word characters>.jpeg``
    located anywhere in the directory tree rooted at *path*.

    Args:
        hash: Captcha hash forming the file name prefix. It is matched
            literally, so regex metacharacters in it have no special meaning.
        path: Root directory to search recursively. (Inside this function the
            name shadows the module-level ``os.path`` import.)

    Returns:
        True if a matching file exists, False otherwise.
    """
    print(f"searching captcha with hash {hash} in {path}")
    # Escape the hash so characters such as '.' or '+' are not interpreted as
    # regex syntax, and require the whole file name to match so that names
    # with trailing junk (e.g. "....jpeg.bak") are not accepted.
    pattern = re.compile(re.escape(hash) + r"_\w{6}\.jpeg")

    for _, _, files in walk(path):
        for file in files:
            if pattern.fullmatch(file):
                return True
    return False
|
|
||||||
|
|
||||||
def search_and_download_new(captchas):
    """Fetch every listed captcha that is not stored locally yet.

    Args:
        captchas: Iterable of mappings, each providing an ``"id"`` and a
            ``"hash"`` key.

    A captcha is downloaded only when no file for its hash is found in the
    training, testing or download directories.
    """
    print(f"Searching and downloading new captchas")
    for entry in captchas:
        captcha_id = entry["id"]
        captcha_hash = entry["hash"]
        # Build a list (not a generator) so all three directories are always
        # searched, in this order, regardless of earlier hits.
        found = [
            search_saved_captcha(captcha_hash, directory)
            for directory in (TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
        ]
        if any(found):
            continue
        fetch_captcha(captcha_id)
|
|
||||||
|
|
||||||
def sort_datasets():
    """Split newly downloaded files between the testing and training sets.

    ``PERCENT_OF_TESTING`` (read from the environment) percent of the regular
    files directly inside ``DOWNLOAD_PATH`` are moved to ``TESTING_PATH``;
    the rest are moved to ``TRAINING_PATH``. Subdirectories of
    ``DOWNLOAD_PATH`` are left untouched.

    Raises:
        TypeError: If ``PERCENT_OF_TESTING`` is not set.
        ValueError: If ``PERCENT_OF_TESTING`` is not an integer.
    """
    print("Sorting datasets")
    percent_of_testing = int(environ.get("PERCENT_OF_TESTING"))

    # Use one file list for both counting and moving. Counting with listdir
    # but moving with a recursive walk would build wrong source paths for
    # nested files and restart the test/train index in every subdirectory.
    new_files = [
        file
        for file in listdir(DOWNLOAD_PATH)
        if path.isfile(f"{DOWNLOAD_PATH}/{file}")
    ]
    amount_of_new_data = len(new_files)
    print(amount_of_new_data)
    amount_to_send_to_test = round(amount_of_new_data * (percent_of_testing / 100))
    print(amount_to_send_to_test)

    for index, file in enumerate(new_files):
        # The first `amount_to_send_to_test` files go to testing, the
        # remainder to training.
        if index < amount_to_send_to_test:
            destination = TESTING_PATH
        else:
            destination = TRAINING_PATH
        move(f"{DOWNLOAD_PATH}/{file}", destination)
|
|
||||||
|
|
||||||
def download_dataset(timeout=30):
    """Synchronise the local datasets with the captcha aggregator API.

    Creates the dataset directories, requests the list of all captchas from
    the API, downloads those not stored locally yet, and finally distributes
    the new files between the testing and training directories.

    Args:
        timeout: Seconds to wait for the listing request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Raises:
        requests.HTTPError: If the listing request returns an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    prepare_dirs()

    response = requests.get(
        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all",
        timeout=timeout,
    )
    # Surface HTTP errors directly rather than as a confusing JSON/KeyError.
    response.raise_for_status()
    captchas = response.json()["captchas"]

    search_and_download_new(captchas)
    sort_datasets()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def train_nn():
    """Placeholder for the training step; currently does nothing."""
    return None
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Run the pipeline stages in order: refresh the datasets, then train.
    for stage in (download_dataset, train_nn):
        stage()
|
|
||||||
|
|
||||||
|
|
|
@ -1,7 +0,0 @@
|
||||||
certifi==2025.4.26
|
|
||||||
charset-normalizer==3.4.2
|
|
||||||
dotenv==0.9.9
|
|
||||||
idna==3.10
|
|
||||||
python-dotenv==1.1.0
|
|
||||||
requests==2.32.3
|
|
||||||
urllib3==2.4.0
|
|
10
sample.env
10
sample.env
|
@ -1,10 +0,0 @@
|
||||||
CAPTCHA_AGGREGATOR_API=https://captcha.foxarmy.org/api
|
|
||||||
|
|
||||||
#Percentage of newly downloaded captchas to be sent to the testing dataset
|
|
||||||
PERCENT_OF_TESTING=10
|
|
||||||
#Place to download new data
|
|
||||||
DOWNLOAD_PATH=datasets/new
|
|
||||||
#Place to save testing dataset
|
|
||||||
TESTING_PATH=datasets/testing
|
|
||||||
#Place to save training dataset
|
|
||||||
TRAINING_PATH=datasets/training
|
|
Loading…
Reference in New Issue