o
    h-                     @   sj   d dl Z d dlmZ d dlZd dlmZmZmZ d dlZd dl	Z	d dl
Z
ddlmZmZ G dd dZdS )    N)Counter)ListDictUnion   )STOPWORDS_ENSTOPWORDS_NEc                
   @   s.  e Zd ZedZejdejdZedZe	ddgZ
e Ze ZdddZd	d
ddd
ddZd*dedefddZdefddZejdddedefddZejdddedefddZdededefdd Zd+deded"edee fd#d$Zd%edeeeee ef f fd&d'Z d(d) Z!dS ),StanzaSEOMetadataExtractorz[\u0900-\u097F]z[^\w\s\u0900-\u097F])flagsz^\d+$NOUNPROPNF)enhitokenize,pos,lemma@   )
processorstokenize_batch_sizetokenize,posTN	use_cachemax_workersc                 C   s   || _ |ptdt | _i | _| jK | jd s-ztj	dddd d| jd< W n   Y | jd sNztj	dddd d| jd< W n   Y W d	   d	S W d	   d	S W d	   d	S 1 saw   Y  d	S )
z
        Initialize a more efficient Stanza-based keyword extractor.
        
        Args:
            use_cache: Whether to cache results for repeated inputs
            max_workers: Maximum number of worker threads (defaults to CPU count)
           r   r   WARN)r   logging_levelTr   r   N)
r   minmultiprocessing	cpu_countr   _keyword_cache_download_lock_downloadedstanzadownload)selfr   r    r"   >/var/www/html/mimamsha/mimansha/apps/mimansha_main/seoutils.py__init__%   s*   

"z#StanzaSEOMetadataExtractor.__init__langc                 C   sf   t | jds
i | j_|| jjvr-| j|| jd }tj||d ddd|d d| jj|< | jj| S )	z?Get a thread-local pipeline instance for the specified language	pipelinesr   r   NFr   r   )r%   r   download_methoduse_gpur   r   )hasattr_local_storager&   _pipeline_configsgetr   Pipeline)r!   r%   configr"   r"   r#   _get_pipelineA   s   	z(StanzaSEOMetadataExtractor._get_pipeline   )maxsizetextreturnc                 C   s    |dd }| j |rdS dS )z?Detect language using script detection (cached for performance)Ni  ner   )DEVANAGARI_PATTERNsearch)r!   r2   sampler"   r"   r#   detect_languageS   s   z*StanzaSEOMetadataExtractor.detect_languagec                 C   s   | j d| S )z2Clean text with cached results for repeated inputs )CLEAN_TEXT_PATTERNsubstrip)r!   r2   r"   r"   r#   
clean_textZ   s   z%StanzaSEOMetadataExtractor.clean_text	word_textc                 C   sP   t |dkrdS | j|rdS |dkr| tv rdS |dkr&|tv r&dS dS )z5Check if word is a valid keyword (not stopword, etc.)   Fr   r4   T)lenNUMBER_PATTERN	fullmatchlowerr   r   )r!   r>   r%   r"   r"   r#   _is_valid_keyword_   s   z,StanzaSEOMetadataExtractor._is_valid_keywordF   top_nc                 C   s4  dt | }| jr|| jv r| j| S t|dk rg S | |}|du r*| |}| |dkr2dnd}||}g }|jD ]'}	|	jD ]!}
|
j	| j
v rd|dkrT|
j n|
j }| ||rd|| qCq>dd t||D }| jr|| j|< t| jd	krt| j dd
 }|D ]	}| j|d q|S )z3Extract most significant keywords using POS taggingkw_
   Nr4   r   r   c                 S   s   g | ]\}}|qS r"   r"   ).0kw_r"   r"   r#   
<listcomp>   s    z?StanzaSEOMetadataExtractor.extract_keywords.<locals>.<listcomp>i     )hashr   r   r@   r=   r8   r/   	sentenceswordsupos
TARGET_POSr2   rC   lemmarD   appendr   most_commonlistkeyspop)r!   r2   r%   rF   	cache_keycleaned_textnlpdockeywordssentwordr>   resultkeys_to_removekeyr"   r"   r#   extract_keywordsq   s6   





	
z+StanzaSEOMetadataExtractor.extract_keywords	blog_textc                 C   s*   |  |}| |}| ||}||dS )z9Main utility function to get SEO metadata (keywords only))meta_keywordslanguage)r=   r8   rc   )r!   rd   rZ   r%   r]   r"   r"   r#   get_metadata   s   

z'StanzaSEOMetadataExtractor.get_metadatac                 C   s"   | j   | j  | j  dS )zClear the cache to free memoryN)r   clearr8   cache_clearr=   )r!   r"   r"   r#   clear_cache   s   

z&StanzaSEOMetadataExtractor.clear_cache)TN)NrE   )"__name__
__module____qualname__recompiler5   UNICODEr:   rA   	frozensetrR   	threadinglocalr*   Lockr   r   r+   boolintr$   strr/   	functools	lru_cacher8   r=   rD   r   rc   r   r   rg   rj   r"   r"   r"   r#   r	   
   s2    




 &3r	   )r   collectionsr   rn   typingr   r   r   rx   rr   r   	stopwordsr   r   r	   r"   r"   r"   r#   <module>   s    