"""
Licensed under the Apache License , Version 2.0 ( the " License " ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : / / www . apache . org / licenses / LICENSE - 2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an " AS IS " BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
Author : nloginov
Script Name : Discord Favorite Gif Downloader
"""
import base64
import re
import requests
from urllib.parse import urlparse, unquote
import os
from bs4 import BeautifulSoup
import logging
import time
import json
from collections import defaultdict
import hashlib

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger()

# Global counters and error tracking
total_urls = 0
successful_downloads = 0
failed_downloads = 0
error_summary = defaultdict(list)

def ensure_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
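
# Note: on Python 3.2+, ensure_directory() above is equivalent to calling
# os.makedirs(directory, exist_ok=True).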

def extract_and_fix_urls(text):
    pattern = r'https?:?/+[a-zA-Z0-9\-._~:/?#[\]@!$&\'()*+,;=%]+'
    urls = re.findall(pattern, text)
    fixed_urls = []
    for url in urls:
        # Repair schemes that lost their separator in the export
        if url.startswith('http/'):
            url = 'http://' + url[5:]
        elif url.startswith('https/'):
            url = 'https://' + url[6:]
        url = re.sub(r'^(https?:)/+', r'\1//', url)
        # Unwrap Discord's external media proxy to recover the original URL
        if 'discordapp.net/external/' in url:
            parsed = urlparse(url)
            query = parsed.path.split('/')[-1]
            if query.startswith('http'):
                url = unquote(query)
        fixed_urls.append(url)
    return fixed_urls
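
# Example (hypothetical inputs): a mangled scheme such as
# 'https/media.tenor.com/abc/cat.gif' is repaired to
# 'https://media.tenor.com/abc/cat.gif', and a Discord proxy link whose last
# path segment is a percent-encoded URL ('.../external/XYZ/https%3A%2F%2F...')
# is unwrapped back to the original URL.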

def get_imgur_url(imgur_url):
    try:
        # Format for Imgur images
        fmt_url = r'https://i\.imgur\.com/\w+\.(?:jpg|png|gif|mp4)'
        # Handle album URLs
        if '/a/' in imgur_url:
            response = requests.get(imgur_url, timeout=10)
            response.raise_for_status()
            # Extract all image URLs from the album page
            image_urls = re.findall(fmt_url, response.text)
            return image_urls if image_urls else None
        # Handle single image/video URLs
        response = requests.get(imgur_url, timeout=10)
        response.raise_for_status()
        content = response.text
        # Try to find direct image/video URL in the page source
        match = re.search(fmt_url, content)
        if match:
            return match.group(0)
        # If direct URL not found, construct it from the imgur ID
        imgur_id = imgur_url.split('/')[-1]
        return f'https://i.imgur.com/{imgur_id}.jpg'
    except requests.exceptions.RequestException:
        pass  # Silently handle the error
    return None
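
# Note: the final fallback only guesses a direct URL from the post ID, e.g. a
# (hypothetical) page https://imgur.com/abc123 becomes
# https://i.imgur.com/abc123.jpg, which may 404 if the post is not a JPEG.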

def get_tenor_gif_url(tenor_url):
    try:
        response = requests.get(tenor_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        gif_element = soup.select_one('div.Gif img')
        if gif_element and 'src' in gif_element.attrs:
            return gif_element['src']
        meta_content_url = soup.select_one('meta[itemprop="contentUrl"]')
        if meta_content_url and 'content' in meta_content_url.attrs:
            return meta_content_url['content']
    except requests.exceptions.RequestException:
        pass  # Silently handle the error
    return None
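
# The Tenor share page embeds the media URL in its markup, so this scrapes it:
# first the <img> inside the 'div.Gif' container, then the
# itemprop="contentUrl" <meta> tag as a fallback. Both selectors depend on
# Tenor's current page markup and may break if it changes.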

CONTENT_TYPES = {
    'image/gif': ('.gif', 'gif'),
    'video/mp4': ('.mp4', 'mp4'),
    'image/png': ('.png', 'img'),
    'image/jpeg': ('.jpg', 'img'),
    'video/webm': ('.webm', 'webm'),
    'image/webp': ('.webp', 'img')
}

SUPPORTED_EXTENSIONS = tuple(ext for ext, _ in CONTENT_TYPES.values())
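# SUPPORTED_EXTENSIONS evaluates to ('.gif', '.mp4', '.png', '.jpg', '.webm', '.webp'),
# i.e. the extension column of CONTENT_TYPES in insertion order.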

def get_extension_and_subfolder(content_type, direct_url):
    for mime, (ext, subfolder) in CONTENT_TYPES.items():
        if mime in content_type or direct_url.lower().endswith(ext):
            return ext, subfolder
    return None, None
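
# Example: get_extension_and_subfolder('image/gif', url) returns ('.gif', 'gif'),
# and a URL ending in '.webm' matches by extension even when the server reports
# a generic Content-Type such as 'application/octet-stream'.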

def safe_filename(filename, max_length=200):
    # Remove invalid characters
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)
    # Truncate if too long, but keep the extension
    name, ext = os.path.splitext(filename)
    if len(name) > max_length:
        # Use a hash of the full name to ensure uniqueness
        name_hash = hashlib.md5(name.encode()).hexdigest()[:8]
        name = name[:max_length - 9] + '_' + name_hash
    return name + ext
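
# Example: a 300-character name is truncated to its first 191 characters plus
# '_' and an 8-character md5 prefix, i.e. exactly max_length (200) characters
# before the extension, while staying distinguishable from other long names.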

def download_media(url):
    global successful_downloads, failed_downloads
    try:
        if 'imgur.com' in url:
            imgur_urls = get_imgur_url(url)
            if imgur_urls:
                if isinstance(imgur_urls, list):  # It's an album
                    for imgur_url in imgur_urls:
                        download_media(imgur_url)  # Recursive call for each image in the album
                    return
                else:  # Single image/video
                    direct_url = imgur_urls
            else:
                failed_downloads += 1
                error_summary["Imgur URL skipped"].append(url)
                return
        elif 'tenor.com' in url:
            gif_url = get_tenor_gif_url(url)
            if gif_url:
                direct_url = gif_url
            else:
                failed_downloads += 1
                error_summary["Tenor URL skipped"].append(url)
                return
        else:
            # Anything else, including direct links that already end in a
            # supported extension, is fetched as-is
            direct_url = url

        response = requests.get(direct_url, timeout=10, allow_redirects=True)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '').lower()

        extension, subfolder = get_extension_and_subfolder(content_type, direct_url)
        if not extension:
            failed_downloads += 1
            error_summary["Unsupported content type"].append(f"{content_type} - {direct_url}")
            return

        parsed_url = urlparse(unquote(direct_url))
        filename = os.path.basename(parsed_url.path)
        filename, _ = os.path.splitext(filename)
        if not filename or filename == extension:
            path_parts = parsed_url.path.rstrip('/').split('/')
            filename = path_parts[-1] if path_parts else 'unnamed'
        filename = safe_filename(filename + extension)

        download_dir = os.path.join('downloaded', subfolder)
        ensure_directory(download_dir)

        # Avoid overwriting an existing file by appending a counter
        counter = 1
        original_filename = filename
        while os.path.exists(os.path.join(download_dir, filename)):
            name, ext = os.path.splitext(original_filename)
            filename = f"{name}_{counter}{ext}"
            counter += 1

        full_path = os.path.join(download_dir, filename)
        with open(full_path, 'wb') as file:
            file.write(response.content)

        successful_downloads += 1
        progress = (successful_downloads + failed_downloads) / total_urls * 100
        logger.info(f"Downloaded: {filename} ({progress:.1f}% complete)")
    except requests.exceptions.RequestException as e:
        failed_downloads += 1
        if isinstance(e, requests.exceptions.HTTPError):
            error_summary[f"HTTP {e.response.status_code}"].append(url)
        else:
            error_summary["Other errors"].append(f"{url} - {str(e)}")
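
# Resulting layout (with the defaults above): files are written under
# ./downloaded/<subfolder>/, e.g. downloaded/gif/funny-cat.gif for a
# hypothetical GIF, and a name collision yields downloaded/gif/funny-cat_1.gif.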

def read_input_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    if file_path.lower().endswith('.json'):
        try:
            json_data = json.loads(content)
            if 'settings' in json_data:
                content = json_data['settings']
            else:
                logger.warning("JSON file does not contain 'settings' key. Using raw content.")
        except json.JSONDecodeError:
            logger.warning("Invalid JSON format. Using raw content.")
    try:
        decoded_content = base64.b64decode(content).decode('utf-8', errors='ignore')
    except (base64.binascii.Error, UnicodeDecodeError):
        logger.warning("Content is not valid base64 or couldn't be decoded. Using raw content.")
        decoded_content = content
    return decoded_content
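
# The exported 'settings' value is expected to be a base64-encoded binary
# blob; decoding with errors='ignore' keeps the readable URL fragments
# embedded in it. For reference, base64.b64decode('aHR0cHM6Ly8=') == b'https://'.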

def get_input_file():
    for ext in ['txt', 'json']:
        filename = f'data.{ext}'
        if os.path.exists(filename):
            return filename
    logger.error("No valid input file found. Please ensure 'data.txt' or 'data.json' exists.\n"
                 "Note: if your file is 'data.txt', it must contain only the raw value of the 'settings' key.")
    return None

def main():
    global total_urls
    input_file = get_input_file()
    if not input_file:
        return
    decoded_content = read_input_file(input_file)
    urls = extract_and_fix_urls(decoded_content)
    total_urls = len(urls)
    # Guard against division by zero in the statistics below
    if not urls:
        logger.info("No URLs found in the input file.")
        return
    for url in urls:
        download_media(url)

    # Print statistics
    logger.info("\n--- Download Statistics ---")
    logger.info(f"Total URLs processed: {total_urls}")
    logger.info(f"Successful downloads: {successful_downloads}")
    logger.info(f"Failed downloads: {failed_downloads}")
    logger.info(f"Success rate: {successful_downloads / total_urls * 100:.1f}%")

    # Print error summary
    if error_summary:
        logger.info("\n--- Error Summary ---")
        for error_type, urls in error_summary.items():
            logger.info(f"{error_type}: {len(urls)} occurrences")
            if error_type == "HTTP 404":
                logger.info("Sample URLs (max 5):")
                for url in urls[:5]:
                    logger.info(f"- {url}")
            elif len(urls) <= 5:
                for url in urls:
                    logger.info(f"- {url}")
            else:
                logger.info(f"(Showing first 5 of {len(urls)} errors)")
                for url in urls[:5]:
                    logger.info(f"- {url}")

    # Pause for 10 seconds
    logger.info("\nScript finished. Exiting in 10 seconds...")
    time.sleep(10)


if __name__ == "__main__":
    main()
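
# Usage sketch (assuming this file is saved as, say, gif_downloader.py):
#   1. Place your exported 'data.txt' or 'data.json' next to the script
#      ('data.txt' should hold only the raw 'settings' value).
#   2. Run: python gif_downloader.py
# Downloads are grouped by media type under ./downloaded/.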