# scripts/utilities/redditdownload.py
import json
import os
import re  # used by every *_in_page parser below; original `import readme` was a garbled typo for this

import requests
debug = False
def get_image_name(url):
    """Return the final path segment of *url*, used as the local file name."""
    return url.rsplit("/", 1)[-1]
def get_page_url(url, after):
    """Build a hot-sorted listing URL anchored after post id *after*."""
    return f"{url}?sort=hot&t=&after=t3_{after}"
def get_full_url(libreddit_instance, url):
    """Prefix a site-relative path with the libreddit instance base URL."""
    return f"{libreddit_instance}{url}"
def get_subreddit_full_url(libreddit_instance, subreddit):
    """Return the absolute URL of *subreddit* on the given instance."""
    return f"{libreddit_instance}/r/{subreddit}"
def get_posts_ids_in_page(page_content):
    """Extract the post ids from every post <div> in a rendered listing page."""
    post_tags = re.findall(r"(\<div class=\"post\" id\=\".*\"\>)", page_content)
    # The id value is the first 6-7 character word-run inside the tag
    # (the other attribute words in the tag are all shorter).
    return [re.search(r"\w{6,7}", tag).group() for tag in post_tags]
def get_single_videos_in_page(page_content):
    """Collect site-relative preview-video URLs from standalone video posts."""
    tag_pattern = r"(\<video class\=\"post_media_video short\"\ src\=\".*\" width\=\"\d+\" height\=\"\d+\" poster\=\"[\/\w\&\?\.\d\;\=]+\" preload\=\"\w*\" ((controls\=\"\w*\")|(controls))\s*\>)"
    url_pattern = r"(\/preview\/pre\/\w+\.((gif)|(webm)|(mp4)|(mov)|(mkv))\?format=((gif)|(webm)|(mp4)|(mov)|(mkv)))\&amp\;s=\w+"
    urls = []
    for groups in re.findall(tag_pattern, page_content):
        tag = groups[0]  # first group of the findall tuple is the whole <video> tag
        # Un-escape the HTML query separator (&amp; -> &) before downloading.
        urls.append(re.search(url_pattern, tag).group().replace("amp;", ""))
    return urls
def get_galleries_in_page(page_content):
    """Return the comments-page paths of gallery posts found in the page."""
    thumb_pattern = r"(\<a class\=\"post_thumbnail\" href\=\"\/r\/\w+\/comments\/[\w\s\d]+\/[\w\d\s]+\/\" rel\=\"[\w\s\d]*\"\s*\>)"
    path_pattern = r"\/r\/\w+\/comments\/[\w\s\d]+\/[\w\d\s]+\/"
    return [
        re.search(path_pattern, anchor).group()
        for anchor in re.findall(thumb_pattern, page_content)
    ]
def get_single_images_in_page(page_content):
    """Collect direct media paths (/img/...) from standalone image posts."""
    anchor_pattern = r"(\<a href\=\".*\" class=\"post_media_image short\"\ >)"
    media_pattern = r"(\/img\/\w+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)))"
    return [
        re.search(media_pattern, anchor).group()
        for anchor in re.findall(anchor_pattern, page_content)
    ]
def get_all_pages(libreddit_instance, subreddit):
    """Walk the subreddit listing and return the post ids used as page anchors.

    Each returned id serves as the ``after=`` anchor of one listing page; the
    first entry is the first post of the front page. Returns an empty list
    when the subreddit page contains no posts (previously this raised
    IndexError on posts[0]).
    """
    if debug:
        # Hard-coded test target used while developing.
        subreddit = "monerochan"
        libreddit_instance = "https://redlib.kylrth.com"
    url = get_subreddit_full_url(libreddit_instance, subreddit)
    posts = get_posts_ids_in_page(requests.get(url).text)
    if not posts:
        return []
    pages = [posts[0]]
    seen = {posts[0]}  # guard against listings that cycle back to a known anchor
    while True:
        current_page_url = get_page_url(url, pages[-1])
        posts_in_current_page = get_posts_ids_in_page(requests.get(current_page_url).text)
        if not posts_in_current_page:
            break
        next_page = posts_in_current_page[-1]
        if next_page in seen:
            # The instance returned a page we already anchored on: without
            # this check the original loop could run forever.
            break
        seen.add(next_page)
        pages.append(next_page)
    return pages
def find_next_button(page_content):
    """Return the NEXT-page anchor tag from the page, or None when absent."""
    match = re.search(r"(\<a href\=\".*\"\>NEXT\<\/a\>)", page_content)
    return match.group() if match else None
def get_next_page(url):
    """Fetch *url* and return the ``tN_xxxxxx`` anchor of its NEXT link.

    Returns None when the page has no NEXT button.
    """
    page = requests.get(url).text
    button = find_next_button(page)
    if button is None:
        return None
    # Pull the fullname-style id (e.g. "t3_abc123") out of the href.
    return re.search(r"\w{2}\_\w{6,7}", button).group()
def download_gallery_content(libreddit_instance, url, destination):
    """Download every preview media item linked from a gallery (comments) page."""
    page = requests.get(url).text
    anchor_pattern = r"(\<a href\=\"\/preview\/pre\/[\w\d]+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)|(gif)|(webm)|(mp4)|(mov)|(mkv))\?width\=\d+\&amp\;format\=\w*\&amp\;auto=\w*\&amp\;s\=\w+\" \>)"
    media_pattern = r"\/preview\/pre\/[\w\d]+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)|(gif)|(webm)|(mp4)|(mov)|(mkv))\?width\=\d+\&amp\;format\=\w*\&amp\;auto=\w*\&amp\;s\=\w+"
    for groups in re.findall(anchor_pattern, page):
        # groups[0] is the whole <a> tag; un-escape &amp; before requesting.
        media_path = re.search(media_pattern, groups[0]).group().replace("amp;", "")
        download_url(get_full_url(libreddit_instance, media_path), destination)
def download_url(url, destination):
    """Download *url* into directory *destination*, skipping existing files.

    The local file name is the last path segment of the URL. Raises
    requests.HTTPError on a non-2xx response instead of silently saving the
    error page body as a media file (the original behavior).
    """
    print("[I] Downloading: " + url)
    filename = get_image_name(url)
    full_path = os.path.join(destination, filename)  # portable join, not "+ '/' +"
    if os.path.exists(full_path):
        print("[I] This file exists, skipping")
        return
    response = requests.get(url)
    response.raise_for_status()
    with open(full_path, "wb") as handler:
        handler.write(response.content)
def main():
    """Interactively collect settings, then mirror a subreddit's media locally.

    Prompts for subreddit, libreddit/redlib instance, and output directory
    (defaulting to this script's directory), then downloads every single
    image, single video, and gallery item found across all listing pages.
    """
    subreddit = ""
    libreddit_instance = ""
    while not subreddit:
        subreddit = input("Enter a subreddit name (without https://reddit.com/r/)\n>")
    if debug:
        print("[DEBUG] subreddit: " + subreddit)
    while not libreddit_instance:
        libreddit_instance = input("Enter a libreddit (or redlib) (https://github.com/redlib-org/redlib?tab=readme-ov-file#instances)\n>")
    if debug:
        print("[DEBUG] libreddit: " + libreddit_instance)
    # Fixed prompt wording: this is the OUTPUT directory, not an input path.
    destination = input("Enter a path to output (default is \".\")\n>")
    if not destination:
        destination = os.path.dirname(os.path.realpath(__file__))
    if debug:
        print("[DEBUG] output to: " + destination)
    pages = get_all_pages(libreddit_instance, subreddit)
    if not pages:
        print("Error! No pages found!")
        exit(-1)
    url = get_subreddit_full_url(libreddit_instance, subreddit)
    for page in pages:
        page_content = requests.get(get_page_url(url, page)).text
        for image_url in get_single_images_in_page(page_content):
            download_url(get_full_url(libreddit_instance, image_url), destination)
        for video_url in get_single_videos_in_page(page_content):
            download_url(get_full_url(libreddit_instance, video_url), destination)
        for gallery_url in get_galleries_in_page(page_content):
            print("Downloading from gallery: " + gallery_url)
            download_gallery_content(libreddit_instance, get_full_url(libreddit_instance, gallery_url), destination)


# Guard the entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()