Update main.py
This commit is contained in:
parent
6a2ff0fe72
commit
3f80caccbf
1 changed files with 303 additions and 294 deletions
57
main.py
57
main.py
|
@ -21,15 +21,15 @@ import requests
|
||||||
from urllib.parse import urlparse, unquote
|
from urllib.parse import urlparse, unquote
|
||||||
import os
|
import os
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import logging
|
# import logging
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
# Set up logging
|
# Set up logging
|
||||||
logging.basicConfig(level=logging.INFO, format='%(message)s')
|
# logging.basicConfig(level=logging.INFO, format='%(message)s')
|
||||||
logger = logging.getLogger()
|
# logger = logging.getLogger()
|
||||||
|
|
||||||
# Global counters and error tracking
|
# Global counters and error tracking
|
||||||
total_urls = 0
|
total_urls = 0
|
||||||
|
@ -151,7 +151,10 @@ def download_media(url):
|
||||||
if imgur_urls:
|
if imgur_urls:
|
||||||
if isinstance(imgur_urls, list): # It's an album
|
if isinstance(imgur_urls, list): # It's an album
|
||||||
for imgur_url in imgur_urls:
|
for imgur_url in imgur_urls:
|
||||||
download_media(imgur_url) # Recursive call for each image in the album
|
try:
|
||||||
|
download_media(imgur_url) # Recursive call for each image in the album
|
||||||
|
except:
|
||||||
|
print("what the wow?")
|
||||||
return
|
return
|
||||||
else: # Single image/video
|
else: # Single image/video
|
||||||
direct_url = imgur_urls
|
direct_url = imgur_urls
|
||||||
|
@ -160,7 +163,10 @@ def download_media(url):
|
||||||
error_summary["Imgur URL skipped"].append(url)
|
error_summary["Imgur URL skipped"].append(url)
|
||||||
return
|
return
|
||||||
elif 'tenor.com' in url:
|
elif 'tenor.com' in url:
|
||||||
gif_url = get_tenor_gif_url(url)
|
try:
|
||||||
|
gif_url = get_tenor_gif_url(url)
|
||||||
|
except:
|
||||||
|
gif_url = ""
|
||||||
if gif_url:
|
if gif_url:
|
||||||
direct_url = gif_url
|
direct_url = gif_url
|
||||||
else:
|
else:
|
||||||
|
@ -208,7 +214,7 @@ def download_media(url):
|
||||||
file.write(response.content)
|
file.write(response.content)
|
||||||
successful_downloads += 1
|
successful_downloads += 1
|
||||||
progress = (successful_downloads + failed_downloads) / total_urls * 100
|
progress = (successful_downloads + failed_downloads) / total_urls * 100
|
||||||
logger.info(f"Downloaded: {filename} ({progress:.1f}% complete)")
|
print(f"Downloaded: {filename} ({progress:.1f}% complete)")
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
failed_downloads += 1
|
failed_downloads += 1
|
||||||
if isinstance(e, requests.exceptions.HTTPError):
|
if isinstance(e, requests.exceptions.HTTPError):
|
||||||
|
@ -227,14 +233,14 @@ def read_input_file(file_path):
|
||||||
if 'settings' in json_data:
|
if 'settings' in json_data:
|
||||||
content = json_data['settings']
|
content = json_data['settings']
|
||||||
else:
|
else:
|
||||||
logger.warning("JSON file does not contain 'settings' key. Using raw content.")
|
print("JSON file does not contain 'settings' key. Using raw content.")
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
logger.warning("Invalid JSON format. Using raw content.")
|
print("Invalid JSON format. Using raw content.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
decoded_content = base64.b64decode(content).decode('utf-8', errors='ignore')
|
decoded_content = base64.b64decode(content).decode('utf-8', errors='ignore')
|
||||||
except (base64.binascii.Error, UnicodeDecodeError):
|
except (base64.binascii.Error, UnicodeDecodeError):
|
||||||
logger.warning("Content is not valid base64 or couldn't be decoded. Using raw content.")
|
print("Content is not valid base64 or couldn't be decoded. Using raw content.")
|
||||||
decoded_content = content
|
decoded_content = content
|
||||||
|
|
||||||
return decoded_content
|
return decoded_content
|
||||||
|
@ -244,7 +250,7 @@ def get_input_file():
|
||||||
filename = f'data.{ext}'
|
filename = f'data.{ext}'
|
||||||
if os.path.exists(filename):
|
if os.path.exists(filename):
|
||||||
return filename
|
return filename
|
||||||
logger.error("No valid input file found. Please ensure 'data.txt' or 'data.json' exists.\nNote: If your filename is 'data.txt', only raw data from 'settings' key must be inside of it.")
|
print("No valid input file found. Please ensure 'data.txt' or 'data.json' exists.\nNote: If your filename is 'data.txt', only raw data from 'settings' key must be inside of it.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -260,34 +266,37 @@ def main():
|
||||||
total_urls = len(urls)
|
total_urls = len(urls)
|
||||||
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
download_media(url)
|
try:
|
||||||
|
download_media(url)
|
||||||
|
except:
|
||||||
|
print("what the wow?")
|
||||||
|
|
||||||
# Print statistics
|
# Print statistics
|
||||||
logger.info("\n--- Download Statistics ---")
|
print("\n--- Download Statistics ---")
|
||||||
logger.info(f"Total URLs processed: {total_urls}")
|
print(f"Total URLs processed: {total_urls}")
|
||||||
logger.info(f"Successful downloads: {successful_downloads}")
|
print(f"Successful downloads: {successful_downloads}")
|
||||||
logger.info(f"Failed downloads: {failed_downloads}")
|
print(f"Failed downloads: {failed_downloads}")
|
||||||
logger.info(f"Success rate: {successful_downloads/total_urls*100:.1f}%")
|
print(f"Success rate: {successful_downloads/total_urls*100:.1f}%")
|
||||||
|
|
||||||
# Print error summary
|
# Print error summary
|
||||||
if error_summary:
|
if error_summary:
|
||||||
logger.info("\n--- Error Summary ---")
|
print("\n--- Error Summary ---")
|
||||||
for error_type, urls in error_summary.items():
|
for error_type, urls in error_summary.items():
|
||||||
logger.info(f"{error_type}: {len(urls)} occurences")
|
print(f"{error_type}: {len(urls)} occurences")
|
||||||
if error_type == "HTTP 404":
|
if error_type == "HTTP 404":
|
||||||
logger.info("Sample URLs (max 5):")
|
print("Sample URLs (max 5):")
|
||||||
for url in urls[:5]:
|
for url in urls[:5]:
|
||||||
logger.info(f" - {url}")
|
print(f" - {url}")
|
||||||
elif len(urls) <= 5:
|
elif len(urls) <= 5:
|
||||||
for url in urls:
|
for url in urls:
|
||||||
logger.info(f" - {url}")
|
print(f" - {url}")
|
||||||
else:
|
else:
|
||||||
logger.info(f" (Showing first 5 of {len(urls)} errors)")
|
print(f" (Showing first 5 of {len(urls)} errors)")
|
||||||
for url in urls[:5]:
|
for url in urls[:5]:
|
||||||
logger.info(f" - {url}")
|
print(f" - {url}")
|
||||||
|
|
||||||
# Pause for 10 seconds
|
# Pause for 10 seconds
|
||||||
logger.info("\nScript finished. Exiting in 10 seconds...")
|
print("\nScript finished. Exiting in 10 seconds...")
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in a new issue