import stanza
from collections import Counter
import re
from typing import List, Dict, Optional, Union
import functools
import threading
import multiprocessing
from .stopwords import STOPWORDS_EN, STOPWORDS_NE

class StanzaSEOMetadataExtractor:
    # Pre-compile regex patterns for performance
    DEVANAGARI_PATTERN = re.compile(r'[\u0900-\u097F]')
    CLEAN_TEXT_PATTERN = re.compile(r'[^\w\s\u0900-\u097F]', flags=re.UNICODE)
    NUMBER_PATTERN = re.compile(r'^\d+$')
    
    # Only nouns and proper nouns are kept as keyword candidates
    TARGET_POS = frozenset(['NOUN', 'PROPN'])
    
    # Thread-local storage for pipeline instances
    _local_storage = threading.local()
    # Prevent duplicate downloads
    _download_lock = threading.Lock()
    _downloaded = {'en': False, 'hi': False}
    
    # Class-level pipeline configuration
    _pipeline_configs = {
        'en': {
            'processors': 'tokenize,pos,lemma',
            'tokenize_batch_size': 64,
        },
        'hi': {
            'processors': 'tokenize,pos',
            'tokenize_batch_size': 64,
        }
    }
    
    def __init__(self, use_cache: bool = True, max_workers: Optional[int] = None):
        """
        Initialize the Stanza-based keyword extractor.
        
        Args:
            use_cache: Whether to cache keyword results for repeated inputs
            max_workers: Maximum number of worker threads
                (defaults to min(4, CPU count))
        """
        self.use_cache = use_cache
        self.max_workers = max_workers or min(4, multiprocessing.cpu_count())
        self._keyword_cache = {}  # Simple in-memory cache for keywords
        
        # Ensure models are downloaded just once
        with self._download_lock:
            if not self._downloaded['en']:
                try:
                    stanza.download('en', processors='tokenize,pos,lemma', logging_level='WARN')
                    self._downloaded['en'] = True
                except Exception:
                    pass  # Assume the model is already available locally
            
            if not self._downloaded['hi']:
                try:
                    stanza.download('hi', processors='tokenize,pos', logging_level='WARN')
                    self._downloaded['hi'] = True
                except Exception:
                    pass  # Assume the model is already available locally
    
    def _get_pipeline(self, lang: str):
        """Get a thread-local pipeline instance for the specified language"""
        if not hasattr(self._local_storage, 'pipelines'):
            self._local_storage.pipelines = {}
        
        if lang not in self._local_storage.pipelines:
            config = self._pipeline_configs.get(lang, self._pipeline_configs['en'])
            self._local_storage.pipelines[lang] = stanza.Pipeline(
                lang=lang,
                processors=config['processors'],
                download_method=None,  # Disable auto-download
                use_gpu=False,         # Force CPU mode
                logging_level='WARN',  # Reduce logging overhead
                tokenize_batch_size=config['tokenize_batch_size']
            )
        
        return self._local_storage.pipelines[lang]
    
    @functools.lru_cache(maxsize=128)
    def detect_language(self, text: str) -> str:
        """Detect language using script detection (cached for performance)"""
        # Use a small sample of text to speed up detection
        sample = text[:500]
        return 'ne' if self.DEVANAGARI_PATTERN.search(sample) else 'en'
    
    @functools.lru_cache(maxsize=128)
    def clean_text(self, text: str) -> str:
        """Clean text with cached results for repeated inputs"""
        return self.CLEAN_TEXT_PATTERN.sub(' ', text).strip()
    
    def _is_valid_keyword(self, word_text: str, lang: str) -> bool:
        """Check if word is a valid keyword (not stopword, etc.)"""
        if len(word_text) <= 2:
            return False
        
        if self.NUMBER_PATTERN.fullmatch(word_text):
            return False
            
        # Filter out stopwords based on language
        if lang == 'en' and word_text.lower() in STOPWORDS_EN:
            return False
        
        # Filter out Nepali stopwords
        if lang == 'ne' and word_text in STOPWORDS_NE:
            return False
            
        return True
    
    def extract_keywords(self, text: str, lang: Optional[str] = None, top_n: int = 70) -> List[str]:
        """Extract the most significant keywords using POS tagging"""
        # Check cache if enabled; the key includes language and top_n so
        # calls with different arguments do not collide
        cache_key = f"kw_{hash(text)}_{lang}_{top_n}"
        if self.use_cache and cache_key in self._keyword_cache:
            return self._keyword_cache[cache_key]
        
        # Skip processing for very short texts
        if len(text) < 10:
            return []
        
        # Clean and normalize text
        cleaned_text = self.clean_text(text)
        
        # Detect language if not provided
        if lang is None:
            lang = self.detect_language(cleaned_text)
        
        # Get the appropriate pipeline; Nepali text is routed through the
        # Hindi pipeline, since only 'en' and 'hi' pipelines are configured
        # and both languages use the Devanagari script
        nlp = self._get_pipeline('hi' if lang == 'ne' else 'en')
        
        # Process the document
        doc = nlp(cleaned_text)
        
        # Collect keywords
        keywords = []
        
        for sent in doc.sentences:
            for word in sent.words:
                if word.upos in self.TARGET_POS:
                    # For Nepali: use the raw surface form (no reliable lemmatizer);
                    # for English, fall back to the surface form if the lemma is missing
                    word_text = word.text.lower() if lang == 'ne' else (word.lemma or word.text).lower()
                    
                    if self._is_valid_keyword(word_text, lang):
                        keywords.append(word_text)
        
        # Get most common keywords
        result = [kw for kw, _ in Counter(keywords).most_common(top_n)]
        
        # Cache result if enabled
        if self.use_cache:
            self._keyword_cache[cache_key] = result
            # Basic cache size management
            if len(self._keyword_cache) > 1000:
                # Remove oldest 20% of entries
                keys_to_remove = list(self._keyword_cache.keys())[:200]
                for key in keys_to_remove:
                    self._keyword_cache.pop(key, None)
            
        return result
    
    def get_metadata(self, blog_text: str) -> Dict[str, Union[List[str], str]]:
        """Main utility function to get SEO metadata (keywords only)"""
        # Clean and normalize text
        cleaned_text = self.clean_text(blog_text)
        lang = self.detect_language(cleaned_text)
        
        # Extract keywords
        keywords = self.extract_keywords(cleaned_text, lang)
        
        # Return only keywords and language
        return {
            'meta_keywords': keywords,
            'language': lang
        }
    
    def clear_cache(self):
        """Clear cached results to free memory"""
        self._keyword_cache.clear()
        # Also clear the lru_cache-backed method caches; these live on the
        # class, so clearing them affects all instances
        self.detect_language.cache_clear()
        self.clean_text.cache_clear()
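

# Minimal usage sketch, not part of the extractor itself. It assumes the
# Stanza English model is available locally and that this module is run as
# part of its package (e.g. `python -m <package>.<module>`) so the relative
# `.stopwords` import resolves; the sample text is purely illustrative.
if __name__ == "__main__":
    extractor = StanzaSEOMetadataExtractor(use_cache=True)
    sample = (
        "Search engine optimization helps blogs reach more readers. "
        "Relevant keywords and clear metadata make articles easier to discover."
    )
    metadata = extractor.get_metadata(sample)
    print(metadata['language'])       # expected 'en': no Devanagari characters
    print(metadata['meta_keywords'])  # most frequent NOUN/PROPN lemmas
    extractor.clear_cache()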