Update main.py

2024-07-06 11:27:59 +02:00 · 2024-07-06 11:27:59 +02:00 · 528ea0848f
commit 528ea0848f
parent 8d355a3659
1 changed files with 113 additions and 41 deletions
--- a/main.py
+++ b/main.py
@ -1,17 +1,19 @@
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+"""
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-# Author: nloginov
-# Script Name: Discord Favorite Gif Downloader
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Author: nloginov
+Script Name: Discord Favorite Gif Downloader
+"""

 import base64
 import re
@ -21,15 +23,19 @@ import os
 from bs4 import BeautifulSoup
 import logging
 import time
+import json
+from collections import defaultdict
+import hashlib

 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(message)s')
 logger = logging.getLogger()

-# Global counters
+# Global counters and error tracking
 total_urls = 0
 successful_downloads = 0
 failed_downloads = 0
+error_summary = defaultdict(list)

 def ensure_directory(directory):
    if not os.path.exists(directory):
@ -76,18 +82,48 @@ def get_tenor_gif_url(tenor_url):
        pass  # Silently handle the error
    return None

+CONTENT_TYPES = {
+    'image/gif': ('.gif', 'gif'),
+    'video/mp4': ('.mp4', 'mp4'),
+    'image/png': ('.png', 'img'),
+    'image/jpeg': ('.jpg', 'img'),
+    'video/webm': ('.webm', 'webm'),
+    'image/webp': ('.webp', 'img')
+}
+
+SUPPORTED_EXTENSIONS = tuple(ext for ext, _ in CONTENT_TYPES.values())
+
+def get_extension_and_subfolder(content_type, direct_url):
+    for mime, (ext, subfolder) in CONTENT_TYPES.items():
+        if mime in content_type or direct_url.lower().endswith(ext):
+            return ext, subfolder
+    return None, None
+
+def safe_filename(filename, max_length=200):
+    # Remove invalid characters
+    filename = re.sub(r'[<>:"/\\|?*]', '', filename)
+    
+    # Truncate if too long, but keep the extension
+    name, ext = os.path.splitext(filename)
+    if len(name) > max_length:
+        # Use a hash of the full name to ensure uniqueness
+        name_hash = hashlib.md5(name.encode()).hexdigest()[:8]
+        name = name[:max_length-9] + '_' + name_hash
+    
+    return name + ext
+
 def download_media(url):
    global successful_downloads, failed_downloads
    try:
-        if url.lower().endswith(('.gif', '.mp4', '.png')):
+        if url.lower().endswith(SUPPORTED_EXTENSIONS):
            direct_url = url
        elif 'tenor.com' in url:
            gif_url = get_tenor_gif_url(url)
            if gif_url:
                direct_url = gif_url
            else:
-                logger.debug(f"Skipped Tenor URL: {url}")
                failed_downloads += 1
+                error_summary["Tenor URL skipped"].append(url)
                return
        else:
            direct_url = url
@ -97,19 +133,11 @@ def download_media(url):
        
        content_type = response.headers.get('Content-Type', '').lower()
        
-        if 'image/gif' in content_type or direct_url.lower().endswith('.gif'):
-            extension = '.gif'
-            subfolder = 'gif'
-        elif 'video/mp4' in content_type or direct_url.lower().endswith('.mp4'):
-            extension = '.mp4'
-            subfolder = 'mp4'
-        elif 'image/png' in content_type or direct_url.lower().endswith('.png'):
-            extension = '.png'
-            subfolder = 'gif'
-        else:
-            logger.debug(f"Skipped unsupported content type: {content_type} for URL: {direct_url}")
-            failed_downloads += 1
-            return
+        extension, subfolder = get_extension_and_subfolder(content_type, direct_url)
+        if not extension:
+           failed_downloads += 1
+           error_summary["Unsupported content type"].append(f"{content_type} - {direct_url}")
+           return

        parsed_url = urlparse(unquote(direct_url))
        filename = os.path.basename(parsed_url.path)
@ -119,9 +147,7 @@ def download_media(url):
            path_parts = parsed_url.path.rstrip('/').split('/')
            filename = path_parts[-1] if path_parts else 'unnamed'
        
-        filename = re.sub(r'[^\w\-_\. ]', '_', filename)
-        filename = filename.strip() or 'unnamed'
-        filename += extension
+        filename = safe_filename(filename + extension)

        download_dir = os.path.join('downloaded', subfolder)
        ensure_directory(download_dir)
@ -140,23 +166,52 @@ def download_media(url):
        progress = (successful_downloads + failed_downloads) / total_urls * 100
        logger.info(f"Downloaded: {filename} ({progress:.1f}% complete)")
    except requests.exceptions.RequestException as e:
-        if isinstance(e, requests.exceptions.HTTPError) and e.response.status_code == 404:
-            logger.debug(f"404 Not Found: {url}")
-        else:
-            logger.warning(f"Failed to download: {url}")
        failed_downloads += 1
+        if isinstance(e, requests.exceptions.HTTPError):
+            error_summary[f"HTTP {e.response.status_code}"].append(url)
+        else:
+            error_summary["Other errors"].append(f"{url} - {str(e)}")

-def main():
-    global total_urls
-    with open('encoded_file.txt', 'r', encoding='utf-8') as file:
+
+def read_input_file(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

+    if file_path.lower().endswith('.json'):
+        try:
+            json_data = json.loads(content)
+            if 'settings' in json_data:
+                content = json_data['settings']
+            else:
+                logger.warning("JSON file does not contain 'settings' key. Using raw content.")
+        except json.JSONDecodeError:
+            logger.warning("Invalid JSON format. Using raw content.")
+
    try:
        decoded_content = base64.b64decode(content).decode('utf-8', errors='ignore')
    except (base64.binascii.Error, UnicodeDecodeError):
        logger.warning("Content is not valid base64 or couldn't be decoded. Using raw content.")
        decoded_content = content

+    return decoded_content
+
+def get_input_file():
+    for ext in ['txt', 'json']:
+        filename = f'data.{ext}'
+        if os.path.exists(filename):
+            return filename
+    logger.error("No valid input file found. Please ensure 'data.txt' or 'data.json' exists.\nNote: If your filename is 'data.txt', only raw data from 'settings' key must be inside of it.")
+    return None
+
+def main():
+    global total_urls
+    
+    input_file = get_input_file()
+    if not input_file:
+        return
+
+    decoded_content = read_input_file(input_file)
+
    urls = extract_and_fix_urls(decoded_content)
    total_urls = len(urls)

@ -170,8 +225,25 @@ def main():
    logger.info(f"Failed downloads: {failed_downloads}")
    logger.info(f"Success rate: {successful_downloads/total_urls*100:.1f}%")

+    # Print error summary
+    if error_summary:
+        logger.info("\n--- Error Summary ---")
+        for error_type, urls in error_summary.items():
+            logger.info(f"{error_type}: {len(urls)} occurences")
+            if error_type == "HTTP 404":
+                logger.info("Sample URLs (max 5):")
+                for url in urls[:5]:
+                    logger.info(f"  - {url}")
+            elif len(urls) <= 5:
+                for url in urls:
+                    logger.info(f"  - {url}")
+            else:
+                logger.info(f"  (Showing first 5 of {len(urls)} errors)")
+                for url in urls[:5]:
+                    logger.info(f"  - {url}")
+
    # Pause for 10 seconds
-    logger.info("\nScript finished. Closing in 10 seconds...")
+    logger.info("\nScript finished. Exiting in 10 seconds...")
    time.sleep(10)

 if __name__ == "__main__":