Update main.py

Author: Arma-Damna-Dillo
Date: 2025-01-16 02:31:55 +00:00
Commit: 3f80caccbf (parent 6a2ff0fe72)

main.py (597 changed lines)

@@ -1,294 +1,303 @@
""" """
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
Author: nloginov Author: nloginov
Script Name: Discord Favorite Gif Downloader Script Name: Discord Favorite Gif Downloader
""" """
import base64 import base64
import re import re
import requests import requests
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
import os import os
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
-import logging
+# import logging
import time
import json
from collections import defaultdict
import hashlib

# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(message)s')
-logger = logging.getLogger()
+# logging.basicConfig(level=logging.INFO, format='%(message)s')
+# logger = logging.getLogger()
# Global counters and error tracking
total_urls = 0
successful_downloads = 0
failed_downloads = 0
error_summary = defaultdict(list)

def ensure_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def extract_and_fix_urls(text):
    pattern = r'https?:?/+[a-zA-Z0-9\-._~:/?#[\]@!$&\'()*+,;=%]+'
    urls = re.findall(pattern, text)
    fixed_urls = []
    for url in urls:
        if url.startswith('http/'):
            url = 'http://' + url[5:]
        elif url.startswith('https/'):
            url = 'https://' + url[6:]
        url = re.sub(r'^(https?:)/+', r'\1//', url)
        if 'discordapp.net/external/' in url:
            parsed = urlparse(url)
            query = parsed.path.split('/')[-1]
            if query.startswith('http'):
                url = unquote(query)
        fixed_urls.append(url)
    return fixed_urls

def get_imgur_url(imgur_url):
    try:
        # Format for Imgur images
        fmt_url = r'https://i\.imgur\.com/\w+\.(?:jpg|png|gif|mp4)'
        # Handle album URLs
        if '/a/' in imgur_url:
            response = requests.get(imgur_url, timeout=10)
            response.raise_for_status()
            # Extract all image URLs from the album page
            image_urls = re.findall(fmt_url, response.text)
            return image_urls if image_urls else None
        # Handle single image/video URLs
        response = requests.get(imgur_url, timeout=10)
        response.raise_for_status()
        content = response.text
        # Try to find direct image/video URL in the page source
        match = re.search(fmt_url, content)
        if match:
            return match.group(0)
        # If direct URL not found, construct it from the imgur ID
        imgur_id = imgur_url.split('/')[-1]
        return f'https://i.imgur.com/{imgur_id}.jpg'
    except requests.exceptions.RequestException:
        pass  # Silently handle the error
    return None

def get_tenor_gif_url(tenor_url):
    try:
        response = requests.get(tenor_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        gif_element = soup.select_one('div.Gif img')
        if gif_element and 'src' in gif_element.attrs:
            return gif_element['src']
        meta_content_url = soup.select_one('meta[itemprop="contentUrl"]')
        if meta_content_url and 'content' in meta_content_url.attrs:
            return meta_content_url['content']
    except requests.exceptions.RequestException:
        pass  # Silently handle the error
    return None

CONTENT_TYPES = {
    'image/gif': ('.gif', 'gif'),
    'video/mp4': ('.mp4', 'mp4'),
    'image/png': ('.png', 'img'),
    'image/jpeg': ('.jpg', 'img'),
    'video/webm': ('.webm', 'webm'),
    'image/webp': ('.webp', 'img')
}

SUPPORTED_EXTENSIONS = tuple(ext for ext, _ in CONTENT_TYPES.values())

def get_extension_and_subfolder(content_type, direct_url):
    for mime, (ext, subfolder) in CONTENT_TYPES.items():
        if mime in content_type or direct_url.lower().endswith(ext):
            return ext, subfolder
    return None, None

def safe_filename(filename, max_length=200):
    # Remove invalid characters
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)
    # Truncate if too long, but keep the extension
    name, ext = os.path.splitext(filename)
    if len(name) > max_length:
        # Use a hash of the full name to ensure uniqueness
        name_hash = hashlib.md5(name.encode()).hexdigest()[:8]
        name = name[:max_length-9] + '_' + name_hash
    return name + ext

def download_media(url):
    global successful_downloads, failed_downloads
    try:
        if 'imgur.com' in url:
            imgur_urls = get_imgur_url(url)
            if imgur_urls:
                if isinstance(imgur_urls, list):  # It's an album
                    for imgur_url in imgur_urls:
-                        download_media(imgur_url)  # Recursive call for each image in the album
+                        try:
+                            download_media(imgur_url)  # Recursive call for each image in the album
+                        except:
+                            print("what the wow?")
                    return
                else:  # Single image/video
                    direct_url = imgur_urls
            else:
                failed_downloads += 1
                error_summary["Imgur URL skipped"].append(url)
                return
        elif 'tenor.com' in url:
-            gif_url = get_tenor_gif_url(url)
+            try:
+                gif_url = get_tenor_gif_url(url)
+            except:
+                gif_url = ""
            if gif_url:
                direct_url = gif_url
            else:
                failed_downloads += 1
                error_summary["Tenor URL skipped"].append(url)
                return
        elif url.lower().endswith(SUPPORTED_EXTENSIONS):
            direct_url = url
        else:
            direct_url = url

        response = requests.get(direct_url, timeout=10, allow_redirects=True)
        response.raise_for_status()

        content_type = response.headers.get('Content-Type', '').lower()
        extension, subfolder = get_extension_and_subfolder(content_type, direct_url)
        if not extension:
            failed_downloads += 1
            error_summary["Unsupported content type"].append(f"{content_type} - {direct_url}")
            return

        parsed_url = urlparse(unquote(direct_url))
        filename = os.path.basename(parsed_url.path)
        filename, _ = os.path.splitext(filename)
        if not filename or filename == extension:
            path_parts = parsed_url.path.rstrip('/').split('/')
            filename = path_parts[-1] if path_parts else 'unnamed'

        filename = safe_filename(filename + extension)

        download_dir = os.path.join('downloaded', subfolder)
        ensure_directory(download_dir)

        counter = 1
        original_filename = filename
        while os.path.exists(os.path.join(download_dir, filename)):
            name, ext = os.path.splitext(original_filename)
            filename = f"{name}_{counter}{ext}"
            counter += 1

        full_path = os.path.join(download_dir, filename)
        with open(full_path, 'wb') as file:
            file.write(response.content)
        successful_downloads += 1
        progress = (successful_downloads + failed_downloads) / total_urls * 100
-        logger.info(f"Downloaded: {filename} ({progress:.1f}% complete)")
+        print(f"Downloaded: {filename} ({progress:.1f}% complete)")
    except requests.exceptions.RequestException as e:
        failed_downloads += 1
        if isinstance(e, requests.exceptions.HTTPError):
            error_summary[f"HTTP {e.response.status_code}"].append(url)
        else:
            error_summary["Other errors"].append(f"{url} - {str(e)}")

def read_input_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    if file_path.lower().endswith('.json'):
        try:
            json_data = json.loads(content)
            if 'settings' in json_data:
                content = json_data['settings']
            else:
-                logger.warning("JSON file does not contain 'settings' key. Using raw content.")
+                print("JSON file does not contain 'settings' key. Using raw content.")
        except json.JSONDecodeError:
-            logger.warning("Invalid JSON format. Using raw content.")
+            print("Invalid JSON format. Using raw content.")

    try:
        decoded_content = base64.b64decode(content).decode('utf-8', errors='ignore')
    except (base64.binascii.Error, UnicodeDecodeError):
-        logger.warning("Content is not valid base64 or couldn't be decoded. Using raw content.")
+        print("Content is not valid base64 or couldn't be decoded. Using raw content.")
        decoded_content = content

    return decoded_content

def get_input_file():
    for ext in ['txt', 'json']:
        filename = f'data.{ext}'
        if os.path.exists(filename):
            return filename
-    logger.error("No valid input file found. Please ensure 'data.txt' or 'data.json' exists.\nNote: If your filename is 'data.txt', only raw data from 'settings' key must be inside of it.")
+    print("No valid input file found. Please ensure 'data.txt' or 'data.json' exists.\nNote: If your filename is 'data.txt', only raw data from 'settings' key must be inside of it.")
    return None

def main():
    global total_urls

    input_file = get_input_file()
    if not input_file:
        return

    decoded_content = read_input_file(input_file)
    urls = extract_and_fix_urls(decoded_content)
    total_urls = len(urls)

    for url in urls:
-        download_media(url)
+        try:
+            download_media(url)
+        except:
+            print("what the wow?")

    # Print statistics
-    logger.info("\n--- Download Statistics ---")
-    logger.info(f"Total URLs processed: {total_urls}")
-    logger.info(f"Successful downloads: {successful_downloads}")
-    logger.info(f"Failed downloads: {failed_downloads}")
-    logger.info(f"Success rate: {successful_downloads/total_urls*100:.1f}%")
+    print("\n--- Download Statistics ---")
+    print(f"Total URLs processed: {total_urls}")
+    print(f"Successful downloads: {successful_downloads}")
+    print(f"Failed downloads: {failed_downloads}")
+    print(f"Success rate: {successful_downloads/total_urls*100:.1f}%")

    # Print error summary
    if error_summary:
-        logger.info("\n--- Error Summary ---")
+        print("\n--- Error Summary ---")
        for error_type, urls in error_summary.items():
-            logger.info(f"{error_type}: {len(urls)} occurences")
+            print(f"{error_type}: {len(urls)} occurences")
            if error_type == "HTTP 404":
-                logger.info("Sample URLs (max 5):")
+                print("Sample URLs (max 5):")
                for url in urls[:5]:
-                    logger.info(f" - {url}")
+                    print(f" - {url}")
            elif len(urls) <= 5:
                for url in urls:
-                    logger.info(f" - {url}")
+                    print(f" - {url}")
            else:
-                logger.info(f" (Showing first 5 of {len(urls)} errors)")
+                print(f" (Showing first 5 of {len(urls)} errors)")
                for url in urls[:5]:
-                    logger.info(f" - {url}")
+                    print(f" - {url}")

    # Pause for 10 seconds
-    logger.info("\nScript finished. Exiting in 10 seconds...")
+    print("\nScript finished. Exiting in 10 seconds...")
    time.sleep(10)

if __name__ == "__main__":
    main()