from asyncio import sleep
from pathlib import Path
import argparse, csv, logging, re, sys, time
import urllib3
import requests
from ratelimit import limits, RateLimitException, sleep_and_retry
from urllib.parse import urlparse

# import stackprinter
# stackprinter.set_excepthook(style='darkbg2')

from memento_damage import utils
from memento_damage.analysis import DamageAnalysis

# Local replay service used to resolve WARC/WACZ archives
REPLAY_SRV = 'http://localhost:9990'

# Rate-limiting window: at most MAX_CALLS_PER_MINUTE calls per ONE_MINUTE seconds
ONE_MINUTE = 60
MAX_CALLS_PER_MINUTE = 10

# Default viewport size (width, height) used during analysis
VIEWPORT = (1920, 1080)

# Log parameters
LOG_FILE = 'server.log'
LOG_FORMAT = logging.Formatter('%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
LOG_LEVEL = logging.WARNING  # same numeric value (30) as the original magic literal
log = None  # module-level logger; initialized in main()


def main():
    '''
    Command-line main entry: parse arguments, initialize file logging, and run
    a damage analysis for each requested URI (single URI or "file:" CSV batch).
    '''
    global LOG_LEVEL
    global log

    args = parseArgs()

    # Set log level from args (DEBUG takes precedence over VERBOSE)
    if args.VERBOSE: LOG_LEVEL = logging.INFO
    if args.DEBUG: LOG_LEVEL = logging.DEBUG

    '''
    Log initialization: append to <cache>/server.log
    '''
    try:
        utils.mkDir(args.CACHE)

        # Initialize server log file
        logMode = 'a'
        fileHandler = logging.FileHandler(Path(args.CACHE, LOG_FILE), mode=logMode)
        fileHandler.setFormatter(LOG_FORMAT)
        log = logging.getLogger('cli')
        log.addHandler(fileHandler)
        log.setLevel(LOG_LEVEL)
    except OSError:
        # Narrowed from a bare except: cache/log creation failures are OS errors
        print('FATAL: Unable to initialize server cache')
        exit(1)

    '''
    Begin URI Processing. Each entry in `urls` is a (url, warcFile-or-None) pair.
    '''
    urls = []

    # Multiple URI mode: "file:<path>.csv" with one URL (and optional WARC) per row
    # If -w or --warc is present, they will be ignored in favor of file-structured format
    if args.URI.startswith('file:'):
        uriFilePath = args.URI[5:]
        try:
            if not Path(uriFilePath).is_file() or not uriFilePath.endswith('.csv'):
                raise FileNotFoundError(f'Unable to find input file: {uriFilePath}')

            with open(uriFilePath, newline='') as csvFile:
                # BUG FIX: csv.reader objects are not subscriptable, so the
                # original `len(csvUrls[0])` raised TypeError before any row
                # was processed; inspect each row's own length instead.
                for row in csv.reader(csvFile, delimiter=','):
                    if not row:
                        continue
                    try:
                        url = row[0]
                        warcFile = None
                        if len(row) > 1 and row[1] and row[1].endswith(('.wacz', '.warc')):
                            warcFile = row[1]

                        # BUG FIX: the original always appended (url, None),
                        # discarding the WARC column it had just parsed.
                        urls.append((url, warcFile))
                    except IndexError:
                        raise ValueError(f'Unable to parse row: {row}')

            log.info(f'Input file loaded: {uriFilePath}')
        except FileNotFoundError as e:
            print(e)
            log.error(e)
            exit(1)
        except ValueError as e:
            print(e)
            log.error(e)
            exit(1)
        except Exception:
            print(f'Unable to parse input file: {uriFilePath}')
            log.error(f'Unable to parse input file: {uriFilePath}')
            exit(1)

    # Single URI mode: honor -w/--warc only when it names a .wacz/.warc file
    else:
        warcFile = args.WARC if args.WARC and args.WARC.endswith(('.wacz', '.warc')) else None
        urls.append((args.URI, warcFile))


    # Process provided URLs
    for url, warc in urls:
        if warc:
            if warc.startswith(('http://', 'https://')):
                # Remote archive: cache folder name encodes both archive and page URL
                uriFolder = f'[{utils.uriToFoldername(warc)}]_{utils.uriToFoldername(url)}'
            else:
                # Local archive: must exist under --warc-dir
                if not args.WARC_DIR:
                    # BUG FIX: original referenced `warcFile`, which can be
                    # unbound here (file-mode rows never guarantee it); use `warc`
                    log.error(f'No directory specified for {warc}')
                    continue
                elif not Path(args.WARC_DIR, warc).is_file():
                    log.error(f'Archive not found at {Path(args.WARC_DIR, warc).absolute()}')
                    continue

                uriFolder = f'[{warc}]_{utils.uriToFoldername(url)}'

            uriCache = Path(args.CACHE, uriFolder).absolute()
            checkDamage(args, uriCache, url, args.WARC_DIR, warcFile=warc)
        else:
            print('\nChecking', url)
            url, error = dereferenceURI(url, redirect=args.REDIRECT, timeout=args.TIMEOUT)
            if error:
                log.error(error)
                continue

            uriFolder = utils.uriToFoldername(url)
            uriCache = Path(args.CACHE, uriFolder).absolute()

            # For Internet Archive mementos not yet cached, switch to the raw
            # ("if_") rewrite-free version of the memento before analyzing
            iaUrlMatch = re.match(r'^(https?:\/\/web\.archive\.org\/web\/\d{14})(\/.*)', url)
            if iaUrlMatch and not uriCache.is_dir():
                url = f'{iaUrlMatch.group(1)}if_{iaUrlMatch.group(2)}'
                uriFolder = utils.uriToFoldername(url)
                uriCache = Path(args.CACHE, uriFolder).absolute()

            checkDamage(args, uriCache, url)


@sleep_and_retry
@limits(calls=MAX_CALLS_PER_MINUTE, period=ONE_MINUTE)
def dereferenceURI(uri, redirect=True, timeout=30):
    '''
    Validate and resolve a URI via an HTTP HEAD request (rate-limited).

    Returns a (url, error) tuple: (finalUrl, None) on success — finalUrl is the
    post-redirect URL when `redirect` is True — or (uri, errorMessage) on failure.
    '''
    # Minimal structural validation: require a scheme and a network location
    result = urlparse(uri)
    if not (result.scheme and result.netloc):
        return uri, f'Invalid URL: {uri}'

    uri = utils.rectifyURI(uri)

    try:
        response = requests.head(uri, allow_redirects=redirect, timeout=timeout)
        if response.status_code == 404:
            return uri, f'Page not found ({response.status_code}): {uri}'
    except requests.exceptions.ConnectionError:
        return uri, f'Connection error: {uri}'
    except requests.exceptions.Timeout:
        return uri, f'Timeout error: {uri}'
    # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt; all requests failures derive from RequestException
    except requests.exceptions.RequestException:
        return uri, f'Unable to dereference URI: {uri}'

    return response.url, None


@sleep_and_retry
@limits(calls=MAX_CALLS_PER_MINUTE, period=ONE_MINUTE)
def checkDamage(args, uriCache, uri, warcDir=None, warcFile=None):
    '''
    Run a DamageAnalysis for one URI (optionally replayed from a WARC/WACZ
    file) and log the resulting damage score. Rate-limited.
    '''
    # NOTE(review): removed unused timing locals (tS/tE/tP) — computed but
    # never read in the original
    analysis = DamageAnalysis(uriCache, uri, warcDir, warcFile, options=(
        args.DEBUG,
        args.IGNORE_CACHE,
        LOG_LEVEL,
        args.TIMEOUT,
        VIEWPORT
    ))
    analysis.analyze()

    score, error = analysis.damageScore()

    # BUG FIX: the original used a conditional expression for side effects and
    # logged the score (not the error) on failure; log the error message itself
    if error:
        log.error(error)
    else:
        log.info(f"{float(score):.2f}% : {'['+warcFile+'] ' if warcFile else ''}{uri}")


def parseArgs():
    '''
    Define and parse command-line arguments.

    Returns the parsed argparse.Namespace. Prints full help and exits with
    status 1 when invoked with no arguments at all.
    '''
    parser = argparse.ArgumentParser(
        prog='Memento Damage CLI',
        description='CLI utility for analysis of Mementos and web pages',
        usage='%(prog)s [options] <URI>',
        epilog='oduwsdl.github.io @WS-DL')

    parser.add_argument('-c', '--cache', dest='CACHE',
                        required=True,
                        help='Set specified cache path')
    parser.add_argument('-d', '--debug', dest='DEBUG',
                        action='store_true',
                        help='Enable debugging mode (default: off)')
    parser.add_argument('-i', '--ignore-cache', dest='IGNORE_CACHE',
                        action='store_true',
                        help='Ignore and overwrite existing cache data (default: off)')
    parser.add_argument('-r', '--redirect', dest='REDIRECT',
                        action='store_true',
                        help='Follow URI redirections')
    parser.add_argument('-t', '--timeout', dest='TIMEOUT',
                        type=int, choices=range(10, 300), default=30,
                        help='Crawl timeout (in seconds; 10 < t < 300; default: 30)')
    parser.add_argument('-V', '--verbose', dest='VERBOSE',
                        action='store_true',
                        help='Enable extended logging output')
    parser.add_argument('-v', '--version',
                        action='version', version='%(prog)s v3.0.0',
                        help='Display version information')
    parser.add_argument('-w', '--warc', dest='WARC',
                        help='WARC/WACZ file name to process at [REPLAY_SRV|localhost]')
    parser.add_argument('-W', '--warc-dir', dest='WARC_DIR',
                        help='Directory for WARC files')
    parser.add_argument('URI',
                        help='URI to analyze')

    # BUG FIX: the original checked `len(sys.argv) < 1` AFTER parse_args().
    # That condition is always False (argv[0] is the program name), and the
    # check was unreachable anyway because parse_args() exits first on missing
    # required arguments. Check before parsing so a bare invocation shows the
    # full help text as intended.
    if len(sys.argv) == 1:
        parser.print_help()
        exit(1)

    return parser.parse_args()


# Run the CLI entry point when executed as a script
if __name__ == '__main__':
    main()