Compare commits
	
		
			2 Commits
		
	
	
		
			f1293aadab
			...
			d1be49d740
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| d1be49d740 | |||
| 60fa54b76e | 
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -160,3 +160,6 @@ cython_debug/
 | 
				
			|||||||
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 | 
					#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 | 
				
			||||||
#.idea/
 | 
					#.idea/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.env
 | 
				
			||||||
 | 
					datasets
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										78
									
								
								main.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										78
									
								
								main.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,78 @@
 | 
				
			|||||||
 | 
					from os import environ, makedirs, path, walk, listdir
 | 
				
			||||||
 | 
					from shutil import move
 | 
				
			||||||
 | 
					from dotenv import load_dotenv
 | 
				
			||||||
 | 
					from base64 import b64decode
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
 | 
					import requests
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Populate the process environment via python-dotenv before reading settings.
					load_dotenv()

					# Dataset directories taken from the environment; each is None when unset.
					DOWNLOAD_PATH = environ.get("DOWNLOAD_PATH")
					TESTING_PATH = environ.get("TESTING_PATH")
					TRAINING_PATH = environ.get("TRAINING_PATH")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def prepare_dirs():
					    """Create the download, testing and training directories if they are missing."""
					    # Same order as before: download, testing, training.
					    for directory in (DOWNLOAD_PATH, TESTING_PATH, TRAINING_PATH):
					        makedirs(directory, exist_ok=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def fetch_captcha(id, timeout=30):
					    """Download one captcha from the aggregator API and save it as a JPEG.

					    The decoded image is written to DOWNLOAD_PATH as
					    ``<hash>_<solution>.jpeg``, using the "hash", "solution" and base64
					    "image" fields of the API response.

					    Args:
					        id: Identifier of the captcha in the aggregator API.
					        timeout: Seconds to wait for the server before giving up.

					    Raises:
					        requests.RequestException: On connection failure, timeout or an
					            HTTP error status.
					    """
					    print(f"Fetching captcha with id {id}")
					    # Without a timeout requests may block forever on a stalled server.
					    response = requests.get(
					        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/{id}",
					        timeout=timeout,
					    )
					    # Fail loudly on 4xx/5xx instead of trying to parse an error body.
					    response.raise_for_status()
					    captcha = response.json()["captcha"]

					    with open(f"{DOWNLOAD_PATH}/{captcha['hash']}_{captcha['solution']}.jpeg", 'wb') as captcha_file:
					        captcha_file.write(b64decode(captcha['image']))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def search_saved_captcha(hash, path):
					    """Report whether a captcha with the given hash is stored under a directory.

					    Saved captchas are named ``<hash>_<solution>.jpeg`` where the solution
					    is six word characters. The directory tree is searched recursively.

					    Args:
					        hash: Captcha hash that prefixes the file name.
					        path: Root directory to search.

					    Returns:
					        True if a matching file exists anywhere under ``path``, else False.
					    """
					    print(f"searching captcha with hash {hash} in {path}")
					    # Escape the hash so regex metacharacters in it are matched literally.
					    regex = re.compile(re.escape(hash) + r'_\w{6}\.jpeg')

					    # fullmatch: the whole file name must fit the pattern, not just a prefix,
					    # so leftovers such as "<hash>_abcdef.jpeg.part" do not count as saved.
					    return any(
					        regex.fullmatch(file) is not None
					        for _, _, files in walk(path)
					        for file in files
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def search_and_download_new(captchas):
					    """Download every listed captcha that is not stored locally yet.

					    A captcha counts as stored when a file for its hash exists under
					    TRAINING_PATH, TESTING_PATH or DOWNLOAD_PATH.

					    Args:
					        captchas: Iterable of mappings with at least the keys "id" and "hash".
					    """
					    print("Searching and downloading new captchas")
					    for captcha in captchas:
					        captcha_id = captcha["id"]
					        captcha_hash = captcha["hash"]
					        # any() stops at the first hit, so the remaining directory trees
					        # are not walked once a saved copy has been found.
					        already_saved = any(
					            search_saved_captcha(captcha_hash, directory)
					            for directory in (TRAINING_PATH, TESTING_PATH, DOWNLOAD_PATH)
					        )
					        if not already_saved:
					            fetch_captcha(captcha_id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def sort_datasets():
					    """Split freshly downloaded captchas between the testing and training sets.

					    PERCENT_OF_TESTING percent (rounded) of the regular files located
					    directly inside DOWNLOAD_PATH are moved to TESTING_PATH; the remaining
					    files are moved to TRAINING_PATH. Subdirectories of DOWNLOAD_PATH are
					    left untouched.

					    Raises:
					        RuntimeError: If the PERCENT_OF_TESTING environment variable is unset.
					        ValueError: If PERCENT_OF_TESTING is not an integer.
					    """
					    print("Sorting datasets")
					    raw_percent = environ.get("PERCENT_OF_TESTING")
					    if raw_percent is None:
					        raise RuntimeError("PERCENT_OF_TESTING environment variable is not set")
					    percent_of_testing = int(raw_percent)

					    # One listing drives both the count and the moves. Counting top-level
					    # files but moving via a recursive walk built wrong source paths for
					    # nested files and restarted the index in every subdirectory.
					    new_files = [
					        file for file in listdir(DOWNLOAD_PATH)
					        if path.isfile(f'{DOWNLOAD_PATH}/{file}')
					    ]
					    amount_of_new_data = len(new_files)
					    print(amount_of_new_data)
					    amount_to_send_to_test = round(amount_of_new_data * (percent_of_testing / 100))
					    print(amount_to_send_to_test)

					    for index, file in enumerate(new_files):
					        # The first `amount_to_send_to_test` files go to testing, the rest to training.
					        destination = TESTING_PATH if index < amount_to_send_to_test else TRAINING_PATH
					        move(f"{DOWNLOAD_PATH}/{file}", destination)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def download_dataset():
					    """Bring the local dataset up to date with the captcha aggregator API.

					    Creates the dataset directories, requests the list of all captchas,
					    downloads the ones not stored locally yet and then splits the new
					    downloads between the testing and training directories.

					    Raises:
					        requests.RequestException: On connection failure, timeout or an
					            HTTP error status while listing captchas.
					    """
					    prepare_dirs()

					    # Without a timeout requests may block forever on a stalled server.
					    response = requests.get(
					        f"{environ.get('CAPTCHA_AGGREGATOR_API')}/captcha/all",
					        timeout=30,
					    )
					    # Fail loudly on 4xx/5xx instead of trying to parse an error body.
					    response.raise_for_status()
					    captchas = response.json()["captchas"]

					    search_and_download_new(captchas)
					    sort_datasets()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def train_nn():
					    """Placeholder for the training step; it currently does nothing."""
					    return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Script entry point: refresh the dataset first, then run training.
					if __name__ == "__main__":
					    download_dataset()
					    train_nn()
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										7
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,7 @@
 | 
				
			|||||||
 | 
					certifi==2025.4.26
 | 
				
			||||||
 | 
					charset-normalizer==3.4.2
 | 
				
			||||||
 | 
					dotenv==0.9.9
 | 
				
			||||||
 | 
					idna==3.10
 | 
				
			||||||
 | 
					python-dotenv==1.1.0
 | 
				
			||||||
 | 
					requests==2.32.3
 | 
				
			||||||
 | 
					urllib3==2.4.0
 | 
				
			||||||
							
								
								
									
										10
									
								
								sample.env
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								sample.env
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
				
			|||||||
 | 
					CAPTCHA_AGGREGATOR_API=https://captcha.foxarmy.org/api
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Percentage of newly downloaded captchas to move into the testing dataset (the rest go to training)
 | 
				
			||||||
 | 
					PERCENT_OF_TESTING=10
 | 
				
			||||||
 | 
					#Place to download new data
 | 
				
			||||||
 | 
					DOWNLOAD_PATH=datasets/new
 | 
				
			||||||
 | 
					#Place to save testing dataset
 | 
				
			||||||
 | 
					TESTING_PATH=datasets/testing
 | 
				
			||||||
 | 
					#Place to save training dataset
 | 
				
			||||||
 | 
					TRAINING_PATH=datasets/training
 | 
				
			||||||
		Reference in New Issue
	
	Block a user