Added a script for downloading all media from a subreddit using the libreddit (redlib) frontend
This commit is contained in:
parent
e27b47d4ee
commit
4ecaf1aa28
|
@ -0,0 +1,163 @@
|
||||||
|
import json
import os
import re

import readme
import requests
|
||||||
|
|
||||||
|
# Debug switch: when True, get_all_pages() replaces its arguments with a fixed
# test subreddit/instance and main() echoes the values the user entered.
debug = False
|
||||||
|
|
||||||
|
def get_image_name(url):
    """Return the file name under which the media at *url* is stored.

    The name is the last path segment of the URL.  A query string or
    fragment (for example the ``?format=mp4&s=...`` suffix of preview links)
    is removed first, so it neither leaks into the file name nor confuses
    the split when it contains a ``/`` itself.
    """
    path = url.split("?", 1)[0].split("#", 1)[0]
    return path.split("/")[-1]
|
||||||
|
|
||||||
|
def get_page_url(url, after):
    """Build the URL of the "hot" listing page that starts after a post.

    *url* is the subreddit URL and *after* a bare post id; the ``t3_``
    prefix is added here.
    """
    pieces = (url, "?sort=hot&t=&after=t3_", after)
    return "".join(pieces)
|
||||||
|
|
||||||
|
def get_full_url(libreddit_instance, url):
    """Return the absolute URL for an instance-relative path.

    *libreddit_instance* is the base URL of the instance and *url* the path
    to append.  When *url* already starts with ``/``, trailing slashes on the
    instance are dropped so that a base entered as ``https://host/`` does not
    yield ``https://host//path``.
    """
    if url.startswith("/"):
        libreddit_instance = libreddit_instance.rstrip("/")
    return libreddit_instance + url
|
||||||
|
|
||||||
|
def get_subreddit_full_url(libreddit_instance, subreddit):
    """Return the URL of *subreddit* on the given instance.

    Trailing slashes on *libreddit_instance* are ignored, so both
    ``https://host`` and ``https://host/`` give ``https://host/r/<name>``.
    """
    return libreddit_instance.rstrip("/") + "/r/" + subreddit
|
||||||
|
|
||||||
|
def get_posts_ids_in_page(page_content):
    """Return the ids of all posts found in the HTML of a listing page.

    A post is recognised by its ``<div class="post" id="...">`` opening tag.
    Ids are returned in document order; an empty list means no post tag was
    found.
    """
    # Capture the id itself and stop at its closing quote: a greedy ``.*``
    # would swallow every following post on the same line, and a fixed
    # ``\w{6,7}`` search would truncate longer ids or fail on shorter ones.
    return re.findall(r'<div class="post" id="(\w+)">', page_content)
|
||||||
|
|
||||||
|
def get_single_videos_in_page(page_content):
    """Return the preview URLs of the single-video posts in a listing page.

    Only ``<video class="post_media_video short" ...>`` tags whose ``src``
    is a ``/preview/pre/<name>.<ext>?format=<ext>&s=<sig>`` link are
    reported; tags with any other kind of source are skipped.  The returned
    paths are instance-relative and have HTML-escaped ampersands decoded.
    """
    video_tag = re.compile(
        r'<video class="post_media_video short" src="(?P<src>[^"]*)"'
        r' width="\d+" height="\d+" poster="[/\w&?.;=]+"'
        r' preload="\w*" (?:controls="\w*"|controls)\s*>'
    )
    # In the page markup the query separator is escaped as "&amp;"; a bare
    # "&" is accepted as well.
    preview_src = re.compile(
        r"/preview/pre/\w+\.(?:gif|webm|mp4|mov|mkv)"
        r"\?format=(?:gif|webm|mp4|mov|mkv)&(?:amp;)?s=\w+"
    )

    videos_url = []
    for tag in video_tag.finditer(page_content):
        found = preview_src.search(tag.group("src"))
        if found is None:
            # Not a preview link: skip the tag instead of failing on it.
            continue
        videos_url.append(found.group().replace("&amp;", "&"))
    return videos_url
|
||||||
|
|
||||||
|
def get_galleries_in_page(page_content):
    """Return the comment-page paths linked from post thumbnails.

    Every ``<a class="post_thumbnail" href="/r/<sub>/comments/<id>/<title>/"
    rel="...">`` tag in *page_content* contributes its ``href`` path, in
    document order.
    """
    thumbnail_links = re.findall(r"(\<a class\=\"post_thumbnail\" href\=\"\/r\/\w+\/comments\/[\w\s\d]+\/[\w\d\s]+\/\" rel\=\"[\w\s\d]*\"\s*\>)", page_content)
    return [
        re.search(r"\/r\/\w+\/comments\/[\w\s\d]+\/[\w\d\s]+\/", link).group()
        for link in thumbnail_links
    ]
|
||||||
|
|
||||||
|
def get_single_images_in_page(page_content):
    """Return the ``/img/...`` paths of the single-media posts in a page.

    Looks at every ``<a href="..." class="post_media_image short" >`` tag
    and reports the ``/img/<name>.<ext>`` part of its ``href`` for the
    extensions png, jpg, jpeg, mp3, flac and ogg.  Links that do not point
    at such a path are skipped.
    """
    images_url = []
    # ``[^"]*`` keeps each match inside one tag even when several links
    # share a line.
    for href in re.findall(r'<a href="([^"]*)" class="post_media_image short" >', page_content):
        # "jpeg" must be tried before "jpg", otherwise ".jpeg" URLs are cut
        # short to ".jpg".
        found = re.search(r"/img/\w+\.(?:png|jpeg|jpg|mp3|flac|ogg)", href)
        if found is None:
            continue
        images_url.append(found.group())
    return images_url
|
||||||
|
|
||||||
|
def get_all_pages(libreddit_instance, subreddit):
    """Collect the pagination cursors of a subreddit listing.

    The first cursor is the id of the first post on the front page; every
    further cursor is the id of the last post on the page that follows the
    previous cursor.  Each cursor is meant to be passed to get_page_url().

    Returns a list of post ids, or an empty list when the front page
    contains no posts.  When the module-level ``debug`` flag is set, both
    arguments are replaced by fixed test values.
    """
    if debug:
        subreddit = "monerochan"
        libreddit_instance = "https://redlib.kylrth.com"

    url = get_subreddit_full_url(libreddit_instance, subreddit)

    posts = get_posts_ids_in_page(requests.get(url).text)
    if not posts:
        # Nothing to paginate; let the caller report it.
        return []

    pages = [posts[0]]
    seen = {posts[0]}
    while True:
        current_page_url = get_page_url(url, pages[-1])
        current_page_content = requests.get(current_page_url).text
        posts_in_current_page = get_posts_ids_in_page(current_page_content)
        if not posts_in_current_page:
            break

        next_page = posts_in_current_page[-1]
        if next_page in seen:
            # The listing did not advance; stop instead of requesting the
            # same page forever.
            break
        seen.add(next_page)
        pages.append(next_page)
    return pages
|
||||||
|
|
||||||
|
def find_next_button(page_content):
    """Return the HTML of the ``NEXT`` link in *page_content*, or None.

    The link is an ``<a href="...">NEXT</a>`` element; the whole matched
    text is returned.
    """
    match = re.search(r"(\<a href\=\".*\"\>NEXT\<\/a\>)", page_content)
    return match.group() if match else None
|
||||||
|
|
||||||
|
def get_next_page(url):
    """Return the cursor of the page that follows *url*, or None.

    Downloads *url*, looks for its ``NEXT`` link and extracts the
    ``<2 chars>_<6-7 chars>`` token (e.g. ``t3_abc123``) from it.  None is
    returned when the page has no ``NEXT`` link or the link carries no such
    token.
    """
    current_page_content = requests.get(url).text
    found = find_next_button(current_page_content)
    if not found:
        return None

    cursor = re.search(r"\w{2}_\w{6,7}", found)
    return cursor.group() if cursor else None
|
||||||
|
|
||||||
|
def download_gallery_content(libreddit_instance, url, destination):
    """Download every media item linked from a gallery post.

    *url* is the absolute URL of the post page.  Each
    ``<a href="/preview/pre/<name>.<ext>?width=..&format=..&auto=..&s=.." >``
    link found there is resolved against *libreddit_instance* and handed to
    download_url() together with *destination*.
    """
    gallery_content_html = requests.get(url).text

    # In the page markup the query separators are escaped as "&amp;"; a bare
    # "&" is accepted as well.  The group captures the link target only.
    media_links = re.findall(
        r'<a href="(/preview/pre/\w+'
        r"\.(?:png|jpg|jpeg|mp3|flac|ogg|gif|webm|mp4|mov|mkv)"
        r'\?width=\d+&(?:amp;)?format=\w*&(?:amp;)?auto=\w*&(?:amp;)?s=\w+)" >',
        gallery_content_html,
    )
    for link in media_links:
        media_path = link.replace("&amp;", "&")
        download_url(get_full_url(libreddit_instance, media_path), destination)
|
||||||
|
|
||||||
|
def download_url(url, destination):
    """Download *url* into the directory *destination*.

    The file name comes from get_image_name().  A file that already exists
    is left untouched.  Responses with an HTTP error status are reported and
    not written, so an error page is never stored as media (and then
    skipped as "existing" on the next run).
    """
    print("[I] Downloading: " + url)
    filename = get_image_name(url)
    full_path = os.path.join(destination, filename)
    if os.path.exists(full_path):
        print("[I] This file exists, skipping")
        return

    response = requests.get(url)
    if not response.ok:
        print("[E] Download failed with HTTP status " + str(response.status_code) + ", skipping")
        return

    with open(full_path, "wb") as handler:
        handler.write(response.content)
|
||||||
|
|
||||||
|
def main():
    """Interactively download all media of a subreddit.

    Asks for the subreddit name, the libreddit/redlib instance and the
    output directory (an empty answer selects the directory containing this
    script), collects the listing pages and downloads the single images,
    single videos and gallery contents found on each of them.

    Exits with status -1 when no listing pages are found.
    """
    subreddit = ""
    libreddit_instance = ""

    while not subreddit:
        subreddit = input("Enter a subreddit name (without https://reddit.com/r/)\n>")
    if debug:
        print("[DEBUG] subreddit: " + subreddit)

    while not libreddit_instance:
        libreddit_instance = input("Enter a libreddit (or redlib) (https://github.com/redlib-org/redlib?tab=readme-ov-file#instances)\n>")
    if debug:
        print("[DEBUG] libreddit: " + libreddit_instance)

    destination = input("Enter a path to input (default is \".\")\n>")
    if not destination:
        destination = os.path.dirname(os.path.realpath(__file__))
    if debug:
        print("[DEBUG] output to: " + destination)

    pages = get_all_pages(libreddit_instance, subreddit)
    if not pages:
        print("Error! No pages found!")
        # Same effect as the site-provided exit(), without depending on it.
        raise SystemExit(-1)

    url = get_subreddit_full_url(libreddit_instance, subreddit)

    # The first cursor is the id of the first post, and a cursor page lists
    # only what comes *after* it, so the unfiltered front page is scanned
    # too; otherwise the first post would never be downloaded.
    page_urls = [url] + [get_page_url(url, page) for page in pages]

    # The front page overlaps the first cursor page; remember what has been
    # handled so nothing is fetched twice.
    handled = set()

    for page_url in page_urls:
        page_content = requests.get(page_url).text

        for image_url in get_single_images_in_page(page_content):
            if image_url in handled:
                continue
            handled.add(image_url)
            download_url(get_full_url(libreddit_instance, image_url), destination)

        for video_url in get_single_videos_in_page(page_content):
            if video_url in handled:
                continue
            handled.add(video_url)
            download_url(get_full_url(libreddit_instance, video_url), destination)

        for gallery_url in get_galleries_in_page(page_content):
            if gallery_url in handled:
                continue
            handled.add(gallery_url)
            print("Downloading from gallery: " + gallery_url)
            full_gallery_url = get_full_url(libreddit_instance, gallery_url)
            download_gallery_content(libreddit_instance, full_gallery_url, destination)
|
||||||
|
|
||||||
|
# Module-level entry point: start the interactive downloader.
main()
|
Loading…
Reference in New Issue