Added script for downloading all media from subreddit using libreddit (redlib) frontend

2024-09-02 23:02:09 +03:00 · 2024-09-02 23:02:09 +03:00 · 4ecaf1aa28
parent e27b47d4ee
commit 4ecaf1aa28
1 changed files with 163 additions and 0 deletions
--- a/utilities/redditdownload.py
+++ b/utilities/redditdownload.py
@ -0,0 +1,163 @@
+import json
+import os
+import re
+
+# NOTE(review): `readme` is not referenced anywhere in the visible code; it may
+# have been meant to be `import re` — confirm before removing.
+import readme
+import requests
+
+# Debug switch: when True, get_all_pages() ignores its arguments in favour of a
+# hard-coded test subreddit/instance and main() echoes the values it was given.
+debug = False
+
+def get_image_name(url):
+    """Return the file name component of *url*.
+
+    The name is the last ``/``-separated segment of the URL with any query
+    string (``?...``) or fragment (``#...``) removed, so a media link such as
+    ``/preview/pre/abc.mp4?format=mp4&s=...`` maps to ``abc.mp4`` rather than
+    to a name that still contains ``?`` and ``&``.
+    """
+    # Cut the fragment and the query off before looking for the last "/", so
+    # a slash inside the query cannot be mistaken for a path separator.
+    path = url.split("#", 1)[0].split("?", 1)[0]
+    return path.rsplit("/", 1)[-1]
+
+def get_page_url(url, after):
+    """Build the listing URL for the page that follows post id *after*.
+
+    The fixed ``sort=hot`` query and an ``after=t3_<after>`` cursor are
+    appended to *url* as-is.
+    """
+    query_prefix = "?sort=hot&t=&after=t3_"
+    return "".join((url, query_prefix, after))
+
+def get_full_url(libreddit_instance, url):
+    """Join the instance base URL and a site-relative path into a full URL.
+
+    Trailing slashes on *libreddit_instance* are dropped and a leading slash
+    is added to *url* when it is missing, so exactly one ``/`` separates the
+    two parts however the instance address was typed.
+    """
+    base = libreddit_instance.rstrip("/")
+    path = url if url.startswith("/") else "/" + url
+    return base + path
+
+def get_subreddit_full_url(libreddit_instance, subreddit):
+    """Return the listing URL of *subreddit* on the given instance.
+
+    Trailing slashes on the instance address and stray slashes around the
+    subreddit name are removed, so the result is always
+    ``<instance>/r/<subreddit>`` without doubled or trailing separators.
+    """
+    return libreddit_instance.rstrip("/") + "/r/" + subreddit.strip("/")
+
+def get_posts_ids_in_page(page_content):
+    """Return the ids of the posts found in a listing page, in page order.
+
+    A post is recognised by a ``<div class="post" id="...">`` opening tag and
+    the value of its ``id`` attribute is returned unchanged.
+
+    The attribute value is captured directly instead of being re-discovered
+    by searching the tag for a 6-7 character word, so ids of any length are
+    returned whole, a tag without such a word no longer raises, and several
+    posts on the same line are all found.
+    """
+    return re.findall(r'<div class="post" id="([^"]+)"', page_content)
+
+def get_single_videos_in_page(page_content):
+    """Return the site-relative URLs of the videos embedded in a listing page.
+
+    Looks for ``<video class="post_media_video short" ...>`` tags and extracts
+    from each one the ``/preview/pre/<name>.<ext>?format=<ext>&s=<sig>`` link,
+    with the HTML-escaped ``&amp;`` turned back into ``&``.
+
+    Tags that contain no such link are skipped instead of raising
+    ``AttributeError``.
+    """
+    tag_pattern = r"(\<video class\=\"post_media_video short\"\ src\=\".*\" width\=\"\d+\" height\=\"\d+\" poster\=\"[\/\w\&\?\.\d\;\=]+\" preload\=\"\w*\" ((controls\=\"\w*\")|(controls))\s*\>)"
+    link_pattern = r"(\/preview\/pre\/\w+\.((gif)|(webm)|(mp4)|(mov)|(mkv))\?format=((gif)|(webm)|(mp4)|(mov)|(mkv)))\&amp\;s=\w+"
+
+    videos_url = []
+    for groups in re.findall(tag_pattern, page_content):
+        # The tag pattern has several groups, so findall yields tuples;
+        # element 0 is the outermost group, i.e. the whole <video> tag.
+        link = re.search(link_pattern, groups[0])
+        if link is None:
+            # The tag matched but carries no preview link we know how to fetch.
+            continue
+        videos_url.append(link.group().replace("amp;", ""))
+    return videos_url
+
+def get_galleries_in_page(page_content):
+    """Return the post paths linked from thumbnail anchors in a listing page.
+
+    Every ``<a class="post_thumbnail" href="/r/<sub>/comments/<id>/<slug>/"
+    rel="...">`` tag contributes its ``/r/.../comments/.../.../`` path; the
+    caller opens each of these as a gallery.
+    """
+    thumbnail_tags = re.findall(r"(\<a class\=\"post_thumbnail\" href\=\"\/r\/\w+\/comments\/[\w\s\d]+\/[\w\d\s]+\/\" rel\=\"[\w\s\d]*\"\s*\>)", page_content)
+    return [
+        re.search(r"\/r\/\w+\/comments\/[\w\s\d]+\/[\w\d\s]+\/", tag).group()
+        for tag in thumbnail_tags
+    ]
+
+def get_single_images_in_page(page_content):
+    """Return the site-relative ``/img/...`` URLs of media linked in a page.
+
+    Looks for ``<a href="..." class="post_media_image short" >`` anchors and
+    extracts the ``/img/<name>.<ext>`` path (png, jpg, jpeg, mp3, flac or ogg)
+    from each one.
+
+    Anchors whose href holds no such path are skipped instead of raising
+    ``AttributeError``.
+    """
+    anchor_pattern = r"(\<a href\=\".*\" class=\"post_media_image short\"\ >)"
+    path_pattern = r"(\/img\/\w+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)))"
+
+    images_url = []
+    for anchor in re.findall(anchor_pattern, page_content):
+        path = re.search(path_pattern, anchor)
+        if path is None:
+            # The anchor matched but does not point at an /img/ file.
+            continue
+        images_url.append(path.group())
+    return images_url
+
+def get_all_pages(libreddit_instance, subreddit):
+    """Collect the pagination cursors (post ids) of a subreddit listing.
+
+    The first cursor is the id of the first post on the front page. Each
+    following cursor is the id of the last post on the page reached through
+    the previous cursor. Crawling stops when a page contains no posts.
+
+    Returns a list of post ids suitable for get_page_url(); the list is empty
+    when the front page contains no recognisable posts, so the caller can
+    report the problem instead of hitting an IndexError here.
+    """
+    if debug:
+        # Debug runs always crawl a fixed, known subreddit and instance.
+        subreddit = "monerochan"
+        libreddit_instance = "https://redlib.kylrth.com"
+
+    url = get_subreddit_full_url(libreddit_instance, subreddit)
+
+    posts = get_posts_ids_in_page(requests.get(url).text)
+    if not posts:
+        return []
+
+    pages = [posts[0]]
+    seen = {posts[0]}
+    while True:
+        page_content = requests.get(get_page_url(url, pages[-1])).text
+        posts_in_page = get_posts_ids_in_page(page_content)
+        if not posts_in_page:
+            break
+
+        next_page = posts_in_page[-1]
+        if next_page in seen:
+            # The listing handed back a cursor we already followed; going on
+            # would request the same pages forever.
+            break
+        seen.add(next_page)
+        pages.append(next_page)
+    return pages
+
+def find_next_button(page_content):
+    """Return the HTML of the ``<a href="...">NEXT</a>`` link, or None.
+
+    The first such anchor found in *page_content* is returned as text.
+    """
+    match = re.search(r"(\<a href\=\".*\"\>NEXT\<\/a\>)", page_content)
+    return match.group() if match else None
+
+def get_next_page(url):
+    """Fetch *url* and return the cursor token of its NEXT link, or None.
+
+    The token is the first ``xx_yyyyyy``-shaped word (two word characters, an
+    underscore, then six or seven word characters) inside the NEXT anchor.
+    None is returned when the page has no NEXT link, and also when the link
+    contains no such token (previously that case raised AttributeError).
+    """
+    current_page_content = requests.get(url).text
+    next_button = find_next_button(current_page_content)
+    if not next_button:
+        return None
+
+    token = re.search(r"\w{2}\_\w{6,7}", next_button)
+    return token.group() if token else None
+
+def download_gallery_content(libreddit_instance, url, destination):
+    """Download every preview media file linked from the gallery page at *url*.
+
+    The page is fetched, each ``<a href="/preview/pre/...?width=..&format=..
+    &auto=..&s=.." >`` anchor is located, its link is un-escaped (``&amp;``
+    becomes ``&``) and the resulting file is saved into *destination*.
+    """
+    link_pattern = r"\/preview\/pre\/[\w\d]+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)|(gif)|(webm)|(mp4)|(mov)|(mkv))\?width\=\d+\&amp\;format\=\w*\&amp\;auto=\w*\&amp\;s\=\w+"
+
+    page_html = requests.get(url).text
+    anchors = re.findall(r"(\<a href\=\"\/preview\/pre\/[\w\d]+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)|(gif)|(webm)|(mp4)|(mov)|(mkv))\?width\=\d+\&amp\;format\=\w*\&amp\;auto=\w*\&amp\;s\=\w+\" \>)", page_html)
+    for groups in anchors:
+        # findall returns one tuple per anchor; element 0 is the whole tag.
+        media_path = re.search(link_pattern, groups[0]).group().replace("amp;", "")
+        download_url(get_full_url(libreddit_instance, media_path), destination)
+
+def download_url(url, destination):
+    """Download *url* into the directory *destination*.
+
+    The file is named after get_image_name(url). Nothing is fetched when a
+    file of that name already exists. A response with an HTTP error status is
+    reported and skipped rather than written to disk, because a saved error
+    page would be treated as an already-downloaded file on the next run.
+    The destination directory is created when it does not exist yet.
+    """
+    print("[I] Downloading: " + url)
+    full_path = os.path.join(destination, get_image_name(url))
+    if os.path.exists(full_path):
+        print("[I] This file exists, skipping")
+        return
+
+    response = requests.get(url)
+    if not response.ok:
+        print("[E] Download failed (HTTP " + str(response.status_code) + "), skipping")
+        return
+
+    if destination:
+        os.makedirs(destination, exist_ok=True)
+    with open(full_path, "wb") as handler:
+        handler.write(response.content)
+
+def _download_page_media(libreddit_instance, page_content, destination):
+    """Download every image, video and gallery linked from one listing page."""
+    for image_url in get_single_images_in_page(page_content):
+        download_url(get_full_url(libreddit_instance, image_url), destination)
+    for video_url in get_single_videos_in_page(page_content):
+        download_url(get_full_url(libreddit_instance, video_url), destination)
+    for gallery_url in get_galleries_in_page(page_content):
+        print("Downloading from gallery: " + gallery_url)
+        full_gallery_url = get_full_url(libreddit_instance, gallery_url)
+        download_gallery_content(libreddit_instance, full_gallery_url, destination)
+
+
+def main():
+    """Ask for a subreddit, an instance and a folder, then download all media.
+
+    The subreddit and instance prompts repeat until something is entered. An
+    empty destination falls back to the directory containing this script.
+    Exits with status -1 when no listing pages could be found.
+    """
+    subreddit = ""
+    libreddit_instance = ""
+
+    while not subreddit:
+        subreddit = input("Enter a subreddit name (without https://reddit.com/r/)\n>")
+    if debug: print("[DEBUG] subreddit: " + subreddit)
+
+    while not libreddit_instance:
+        libreddit_instance = input("Enter a libreddit (or redlib) (https://github.com/redlib-org/redlib?tab=readme-ov-file#instances)\n>")
+    if debug: print("[DEBUG] libreddit: " + libreddit_instance)
+
+    # NOTE(review): the prompt advertises "." as the default, but the fallback
+    # below is the directory containing this script — confirm which is meant.
+    destination = input("Enter a path to input (default is \".\")\n>")
+    if not destination:
+        destination = os.path.dirname(os.path.realpath(__file__))
+    if debug: print("[DEBUG] output to: " + destination)
+
+    pages = get_all_pages(libreddit_instance, subreddit)
+
+    if not pages:
+        print("Error! No pages found!")
+        # Same exit status as before, without relying on the `exit` helper
+        # that is only installed by the `site` module.
+        raise SystemExit(-1)
+
+    url = get_subreddit_full_url(libreddit_instance, subreddit)
+
+    # Every entry of `pages` is used as an "after" cursor, so the listings
+    # fetched in the loop start *after* the first post. Process the front
+    # page itself as well, otherwise the first post is never downloaded;
+    # files seen twice are skipped by download_url().
+    _download_page_media(libreddit_instance, requests.get(url).text, destination)
+
+    for page in pages:
+        page_content = requests.get(get_page_url(url, page)).text
+        _download_page_media(libreddit_instance, page_content, destination)
+
+# Run the interactive downloader only when executed as a script, so the
+# helper functions can be imported without triggering prompts and downloads.
+if __name__ == "__main__":
+    main()