Update main.py

This commit is contained in:
Arma-Damna-Dillo 2025-01-16 02:31:55 +00:00
parent 6a2ff0fe72
commit 3f80caccbf

51
main.py
View file

@@ -21,15 +21,15 @@ import requests
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
import os import os
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import logging # import logging
import time import time
import json import json
from collections import defaultdict from collections import defaultdict
import hashlib import hashlib
# Set up logging # Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s') # logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger() # logger = logging.getLogger()
# Global counters and error tracking # Global counters and error tracking
total_urls = 0 total_urls = 0
@@ -151,7 +151,10 @@ def download_media(url):
if imgur_urls: if imgur_urls:
if isinstance(imgur_urls, list): # It's an album if isinstance(imgur_urls, list): # It's an album
for imgur_url in imgur_urls: for imgur_url in imgur_urls:
try:
download_media(imgur_url) # Recursive call for each image in the album download_media(imgur_url) # Recursive call for each image in the album
except:
print("what the wow?")
return return
else: # Single image/video else: # Single image/video
direct_url = imgur_urls direct_url = imgur_urls
@@ -160,7 +163,10 @@ def download_media(url):
error_summary["Imgur URL skipped"].append(url) error_summary["Imgur URL skipped"].append(url)
return return
elif 'tenor.com' in url: elif 'tenor.com' in url:
try:
gif_url = get_tenor_gif_url(url) gif_url = get_tenor_gif_url(url)
except:
gif_url = ""
if gif_url: if gif_url:
direct_url = gif_url direct_url = gif_url
else: else:
@@ -208,7 +214,7 @@ def download_media(url):
file.write(response.content) file.write(response.content)
successful_downloads += 1 successful_downloads += 1
progress = (successful_downloads + failed_downloads) / total_urls * 100 progress = (successful_downloads + failed_downloads) / total_urls * 100
logger.info(f"Downloaded: {filename} ({progress:.1f}% complete)") print(f"Downloaded: {filename} ({progress:.1f}% complete)")
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
failed_downloads += 1 failed_downloads += 1
if isinstance(e, requests.exceptions.HTTPError): if isinstance(e, requests.exceptions.HTTPError):
@@ -227,14 +233,14 @@ def read_input_file(file_path):
if 'settings' in json_data: if 'settings' in json_data:
content = json_data['settings'] content = json_data['settings']
else: else:
logger.warning("JSON file does not contain 'settings' key. Using raw content.") print("JSON file does not contain 'settings' key. Using raw content.")
except json.JSONDecodeError: except json.JSONDecodeError:
logger.warning("Invalid JSON format. Using raw content.") print("Invalid JSON format. Using raw content.")
try: try:
decoded_content = base64.b64decode(content).decode('utf-8', errors='ignore') decoded_content = base64.b64decode(content).decode('utf-8', errors='ignore')
except (base64.binascii.Error, UnicodeDecodeError): except (base64.binascii.Error, UnicodeDecodeError):
logger.warning("Content is not valid base64 or couldn't be decoded. Using raw content.") print("Content is not valid base64 or couldn't be decoded. Using raw content.")
decoded_content = content decoded_content = content
return decoded_content return decoded_content
@@ -244,7 +250,7 @@ def get_input_file():
filename = f'data.{ext}' filename = f'data.{ext}'
if os.path.exists(filename): if os.path.exists(filename):
return filename return filename
logger.error("No valid input file found. Please ensure 'data.txt' or 'data.json' exists.\nNote: If your filename is 'data.txt', only raw data from 'settings' key must be inside of it.") print("No valid input file found. Please ensure 'data.txt' or 'data.json' exists.\nNote: If your filename is 'data.txt', only raw data from 'settings' key must be inside of it.")
return None return None
def main(): def main():
@@ -260,34 +266,37 @@ def main():
total_urls = len(urls) total_urls = len(urls)
for url in urls: for url in urls:
try:
download_media(url) download_media(url)
except:
print("what the wow?")
# Print statistics # Print statistics
logger.info("\n--- Download Statistics ---") print("\n--- Download Statistics ---")
logger.info(f"Total URLs processed: {total_urls}") print(f"Total URLs processed: {total_urls}")
logger.info(f"Successful downloads: {successful_downloads}") print(f"Successful downloads: {successful_downloads}")
logger.info(f"Failed downloads: {failed_downloads}") print(f"Failed downloads: {failed_downloads}")
logger.info(f"Success rate: {successful_downloads/total_urls*100:.1f}%") print(f"Success rate: {successful_downloads/total_urls*100:.1f}%")
# Print error summary # Print error summary
if error_summary: if error_summary:
logger.info("\n--- Error Summary ---") print("\n--- Error Summary ---")
for error_type, urls in error_summary.items(): for error_type, urls in error_summary.items():
logger.info(f"{error_type}: {len(urls)} occurences") print(f"{error_type}: {len(urls)} occurences")
if error_type == "HTTP 404": if error_type == "HTTP 404":
logger.info("Sample URLs (max 5):") print("Sample URLs (max 5):")
for url in urls[:5]: for url in urls[:5]:
logger.info(f" - {url}") print(f" - {url}")
elif len(urls) <= 5: elif len(urls) <= 5:
for url in urls: for url in urls:
logger.info(f" - {url}") print(f" - {url}")
else: else:
logger.info(f" (Showing first 5 of {len(urls)} errors)") print(f" (Showing first 5 of {len(urls)} errors)")
for url in urls[:5]: for url in urls[:5]:
logger.info(f" - {url}") print(f" - {url}")
# Pause for 10 seconds # Pause for 10 seconds
logger.info("\nScript finished. Exiting in 10 seconds...") print("\nScript finished. Exiting in 10 seconds...")
time.sleep(10) time.sleep(10)
if __name__ == "__main__": if __name__ == "__main__":