Added a script for downloading all media from a subreddit using a libreddit (redlib) frontend

2024-09-02 23:02:09 +03:00 · 2024-09-02 23:02:09 +03:00 · 4ecaf1aa28
parent e27b47d4ee
commit 4ecaf1aa28
1 changed files with 163 additions and 0 deletions
--- a/utilities/redditdownload.py
+++ b/utilities/redditdownload.py
@ -0,0 +1,163 @@
 import json
 import os
 import re

 # NOTE(review): `readme` is not referenced by the visible code, while `re` is
 # used by every parsing helper - confirm whether this import is still needed.
 import readme
 import requests
 # Debug switch: when True, get_all_pages() replaces its arguments with a
 # hard-coded subreddit/instance and main() prints the values it collected.
 debug = False
 def get_image_name(url):
    """Return the file name component of *url*.

    The name is the last ``/``-separated segment of the URL path. Any query
    string (``?...``) or fragment (``#...``) is dropped first, so a URL such
    as ``/preview/pre/abc.mp4?format=mp4&s=123`` yields ``abc.mp4`` instead
    of a name that carries ``?`` and ``&`` characters.

    Args:
        url: URL (or path) of the media file.

    Returns:
        The file name as a string; empty if the path ends with ``/``.
    """
    # Cut the fragment and query before splitting: a "/" inside the query
    # must not be mistaken for a path separator.
    path = url.partition("#")[0].partition("?")[0]
    return path.split("/")[-1]
 def get_page_url(url, after):
    """Build the URL of a listing page that starts after a given post.

    Args:
        url: Base listing URL.
        after: Post id to use as the ``after`` cursor (the ``t3_`` prefix is
            added here).

    Returns:
        *url* followed by the ``sort=hot`` query string carrying the cursor.
    """
    pieces = (url, "?sort=hot&t=&after=t3_", after)
    return "".join(pieces)
 def get_full_url(libreddit_instance, url):
    """Join an instance base URL with a path found in a page.

    Args:
        libreddit_instance: Base URL of the instance, with or without a
            trailing ``/``.
        url: Path to append, normally starting with ``/``.

    Returns:
        The combined URL. When *url* starts with ``/``, trailing slashes on
        the instance are dropped so the result has no ``//`` at the join.
    """
    if url.startswith("/"):
        # The instance comes from user input and often ends with "/".
        return libreddit_instance.rstrip("/") + url
    return libreddit_instance + url
 def get_subreddit_full_url(libreddit_instance, subreddit):
    """Return the listing URL of *subreddit* on the given instance.

    Args:
        libreddit_instance: Base URL of the instance, with or without a
            trailing ``/``.
        subreddit: Subreddit name without the ``/r/`` prefix.

    Returns:
        ``<instance>/r/<subreddit>`` with exactly one ``/`` at the join.
    """
    # Trailing slashes on the user-supplied instance would produce "//r/...".
    return libreddit_instance.rstrip("/") + "/r/" + subreddit
 def get_posts_ids_in_page(page_content):
    """Extract post ids from the HTML of a listing page.

    Finds every ``<div class="post" id="...">`` opening tag and takes the
    first run of 6-7 word characters inside it as the post id.

    Args:
        page_content: HTML text of the page.

    Returns:
        List of id strings in document order. Tags that contain no such run
        are skipped instead of raising.
    """
    posts = []
    # Lazy ".*?" stops at the first '">' so several post tags on the same
    # line are matched one by one rather than merged into a single match.
    posts_html = re.findall(r"(\<div class=\"post\" id\=\".*?\"\>)", page_content)
    for post_html in posts_html:
        match = re.search(r"\w{6,7}", post_html)
        if match is None:
            continue
        posts.append(match.group())
    return posts
 def get_single_videos_in_page(page_content):
    """Extract preview video/GIF URLs from the ``<video>`` tags of a page.

    Matches ``<video class="post_media_video short" ...>`` tags and pulls the
    ``/preview/pre/<name>.<ext>?format=<ext>&amp;s=<sig>`` URL out of each.

    Args:
        page_content: HTML text of the page.

    Returns:
        List of instance-relative URLs with ``&amp;`` unescaped to ``&``.
        Tags that contain no such preview URL are skipped instead of raising.
    """
    videos_url = []
    videos_html = re.findall(r"(\<video class\=\"post_media_video short\"\ src\=\".*\" width\=\"\d+\" height\=\"\d+\" poster\=\"[\/\w\&\?\.\d\;\=]+\" preload\=\"\w*\" ((controls\=\"\w*\")|(controls))\s*\>)", page_content)
    for video_html in videos_html:
        # The pattern has nested groups, so findall yields tuples; item 0 is
        # the outermost group, i.e. the whole tag.
        tag = video_html[0]
        match = re.search(r"(\/preview\/pre\/\w+\.((gif)|(webm)|(mp4)|(mov)|(mkv))\?format=((gif)|(webm)|(mp4)|(mov)|(mkv)))\&amp\;s=\w+", tag)
        if match is None:
            # The tag pattern accepts any src; only preview URLs are usable.
            continue
        videos_url.append(match.group().replace("amp;", ""))
    return videos_url
 def get_galleries_in_page(page_content):
    """Collect the comment-page links behind post thumbnails.

    Looks for ``<a class="post_thumbnail" href="/r/<sub>/comments/<id>/<title>/" ...>``
    tags and returns the ``href`` path of each one.

    Args:
        page_content: HTML text of the page.

    Returns:
        List of instance-relative ``/r/.../comments/.../.../`` paths in
        document order.
    """
    thumbnail_tags = re.findall(r"(\<a class\=\"post_thumbnail\" href\=\"\/r\/\w+\/comments\/[\w\s\d]+\/[\w\d\s]+\/\" rel\=\"[\w\s\d]*\"\s*\>)", page_content)
    # Every matched tag contains the path by construction of the tag pattern.
    return [
        re.search(r"\/r\/\w+\/comments\/[\w\s\d]+\/[\w\d\s]+\/", tag).group()
        for tag in thumbnail_tags
    ]
 def get_single_images_in_page(page_content):
    """Extract ``/img/<name>.<ext>`` URLs from the image links of a page.

    Matches ``<a href="..." class="post_media_image short" >`` tags and pulls
    the ``/img/`` path (png, jpg, jpeg, mp3, flac or ogg) out of each.

    Args:
        page_content: HTML text of the page.

    Returns:
        List of instance-relative ``/img/...`` paths in document order. Tags
        whose content has no such path are skipped instead of raising.
    """
    images_url = []
    images_html = re.findall(r"(\<a href\=\".*\" class=\"post_media_image short\"\ >)", page_content)
    for image_html in images_html:
        match = re.search(r"(\/img\/\w+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)))", image_html)
        if match is None:
            # The tag pattern accepts any href; only /img/ paths are usable.
            continue
        images_url.append(match.group())
    return images_url
 def get_all_pages(libreddit_instance, subreddit):
    """Collect the ``after`` cursors that page through a subreddit listing.

    The first cursor is the id of the first post on the front page; each
    following cursor is the id of the last post on the page fetched with the
    previous cursor. Paging stops when a page yields no post ids.

    Args:
        libreddit_instance: Base URL of the instance (ignored when the
            module-level ``debug`` flag is set).
        subreddit: Subreddit name (ignored when ``debug`` is set).

    Returns:
        List of post ids usable with ``get_page_url``; empty when the front
        page contains no posts.
    """
    if debug:
        subreddit = "monerochan"
        libreddit_instance = "https://redlib.kylrth.com"
    url = get_subreddit_full_url(libreddit_instance, subreddit)
    posts = get_posts_ids_in_page(requests.get(url).text)
    if not posts:
        # Nothing to page through; callers treat an empty list as "no pages".
        return []
    # NOTE(review): assuming the `after` cursor is exclusive, the page for
    # this first cursor starts at the second post - confirm against caller.
    pages = [posts[0]]
    while True:
        current_page_content = requests.get(get_page_url(url, pages[-1])).text
        posts_in_current_page = get_posts_ids_in_page(current_page_content)
        if not posts_in_current_page:
            break
        next_page = posts_in_current_page[-1]
        if next_page in pages:
            # A repeated cursor would make the loop fetch the same pages forever.
            break
        pages.append(next_page)
    return pages
 def find_next_button(page_content):
    """Locate the ``NEXT`` pagination link in a page.

    Args:
        page_content: HTML text of the page.

    Returns:
        The matched ``<a href="...">NEXT</a>`` markup, or ``None`` when the
        page has no such link.
    """
    match = re.search(r"(\<a href\=\".*\"\>NEXT\<\/a\>)", page_content)
    return match.group() if match else None
 def get_next_page(url):
    """Fetch *url* and return the cursor token from its ``NEXT`` link.

    Args:
        url: URL of the listing page to fetch.

    Returns:
        The first ``xx_yyyyyy`` token (two word characters, ``_``, then 6-7
        word characters) found in the NEXT link markup, or ``None`` when the
        page has no NEXT link or the link holds no such token.
    """
    current_page_content = requests.get(url).text
    found = find_next_button(current_page_content)
    if not found:
        return None
    # NOTE(review): \w also matches "_", so the leftmost match can land on an
    # earlier part of the href (e.g. a name containing "_") - confirm.
    token = re.search(r"\w{2}\_\w{6,7}", found)
    return token.group() if token else None
 def download_gallery_content(libreddit_instance, url, destination):
    """Download every preview media file linked from a gallery page.

    Fetches *url*, finds the ``<a href="/preview/pre/...">`` links in it and
    passes each linked file to ``download_url``.

    Args:
        libreddit_instance: Base URL of the instance, used to make the
            extracted paths absolute.
        url: Full URL of the gallery (post) page.
        destination: Directory the files are written to.
    """
    page_html = requests.get(url).text
    link_pattern = r"(\<a href\=\"\/preview\/pre\/[\w\d]+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)|(gif)|(webm)|(mp4)|(mov)|(mkv))\?width\=\d+\&amp\;format\=\w*\&amp\;auto=\w*\&amp\;s\=\w+\" \>)"
    media_pattern = r"\/preview\/pre\/[\w\d]+\.((png)|(jpg)|(jpeg)|(mp3)|(flac)|(ogg)|(gif)|(webm)|(mp4)|(mov)|(mkv))\?width\=\d+\&amp\;format\=\w*\&amp\;auto=\w*\&amp\;s\=\w+"
    for groups in re.findall(link_pattern, page_html):
        # findall yields one tuple per link; item 0 is the whole <a ...> tag.
        anchor_tag = groups[0]
        media_path = re.search(media_pattern, anchor_tag).group()
        media_url = get_full_url(libreddit_instance, media_path.replace("amp;", ""))
        download_url(media_url, destination)
 def download_url(url, destination):
    """Download *url* into *destination* unless the file is already there.

    The target file name comes from ``get_image_name(url)``. Progress is
    reported with ``print``.

    Args:
        url: Full URL of the file to download.
        destination: Directory the file is written to.

    Returns:
        None. Nothing is written when the file already exists or when the
        server answers with an error status.
    """
    print("[I] Downloading: " + url)
    filename = get_image_name(url)
    full_path = os.path.join(destination, filename)
    if os.path.exists(full_path):
        print("[I] This file exists, skipping")
        return
    response = requests.get(url)
    if not response.ok:
        # Saving an error page under the media name would make every later
        # run skip this file as "already downloaded".
        print("[E] Download failed (HTTP " + str(response.status_code) + "), skipping")
        return
    with open(full_path, 'wb') as handler:
        handler.write(response.content)
 def _download_page_media(libreddit_instance, page_content, destination):
    """Download the images, videos and gallery items found in one listing page."""
    for image_url in get_single_images_in_page(page_content):
        full_image_url = get_full_url(libreddit_instance, image_url)
        download_url(full_image_url, destination)
    for video_url in get_single_videos_in_page(page_content):
        full_video_url = get_full_url(libreddit_instance, video_url)
        download_url(full_video_url, destination)
    for gallery_url in get_galleries_in_page(page_content):
        print("Downloading from gallery: " + gallery_url)
        full_gallery_url = get_full_url(libreddit_instance, gallery_url)
        download_gallery_content(libreddit_instance, full_gallery_url, destination)


 def main():
    """Prompt for a subreddit, an instance and an output path, then download.

    The subreddit and instance prompts repeat until a non-empty value is
    entered. An empty output path falls back to the directory containing
    this script. The front page and every page returned by ``get_all_pages``
    are scanned for media.

    Raises:
        SystemExit: With status -1 when ``get_all_pages`` returns no pages.
    """
    subreddit = ""
    libreddit_instance = ""
    while not subreddit:
        subreddit = input("Enter a subreddit name (without https://reddit.com/r/)\n>")
    if debug:
        print("[DEBUG] subreddit: " + subreddit)
    while not libreddit_instance:
        libreddit_instance = input("Enter a libreddit (or redlib) (https://github.com/redlib-org/redlib?tab=readme-ov-file#instances)\n>")
    if debug:
        print("[DEBUG] libreddit: " + libreddit_instance)
    # The prompt names the real default: the script's own directory.
    destination = input("Enter an output path (default is the script directory)\n>")
    if not destination:
        destination = os.path.dirname(os.path.realpath(__file__))
    if debug:
        print("[DEBUG] output to: " + destination)
    pages = get_all_pages(libreddit_instance, subreddit)
    if not pages:
        print("Error! No pages found!")
        # SystemExit directly: exit() is only provided by the site module.
        raise SystemExit(-1)
    url = get_subreddit_full_url(libreddit_instance, subreddit)
    # Scan the front page too: the first cursor is the first post's id, so
    # the cursor pages start after it. Files already saved are skipped by
    # download_url, so the overlap with the first cursor page is harmless.
    _download_page_media(libreddit_instance, requests.get(url).text, destination)
    for page in pages:
        page_content = requests.get(get_page_url(url, page)).text
        _download_page_media(libreddit_instance, page_content, destination)
 # Script entry point. NOTE(review): there is no `if __name__ == "__main__":`
 # guard, so importing this module also starts the interactive downloader.
 main()