o
    h+                     @   s   d Z ddlZddlZddlZddlmZ ddlm  mZ ddl	m
Z
 G dd dejjjZG dd dejZG dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdS )zt
Transformer with partitioned content and position features.

See section 3 of https://arxiv.org/pdf/1805.01052.pdf
    N)ConcatSinusoidalEncodingc                   @   s&   e Zd ZedddZedd ZdS )	FeatureDropoutFunction      ?Fc                 C   s   |dk s|dkrt d||| _|| _|| _| jr#| | |}n| }| jdkrr| jrrtj|	d|	df|j
|j|jd| _| jdkrP| jd n| jd| j d| j  | jd d d d d f | _|| j |S )Nr      z9dropout probability has to be between 0 and 1, but got {})dtypelayoutdevice)
ValueErrorformatptraininplace
mark_dirtyclonetorchemptysizer   r   r	   noisefill_
bernoulli_div_mul_)ctxinputr   r   r   output r   m/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/constituency/partitioned_transformer.pyforward   s0   

zFeatureDropoutFunction.forwardc                 C   s0   | j dkr| jr|| jd d d fS |d d d fS )Nr   )r   r   mulr   )r   grad_outputr   r   r   backward2   s   zFeatureDropoutFunction.backwardN)r   FF)__name__
__module____qualname__staticmethodr   r!   r   r   r   r   r      s
     r   c                   @   s   e Zd ZdZdd ZdS )FeatureDropoutz
    Feature-level dropout: takes an input of size len x num_features and drops
    each feature with probabibility p. A feature is dropped across the full
    portion of the input that corresponds to a single batch element.
    c                 C   s\   t |tr#|\}}t|| j| j| j}t|| j| j| j}||fS t|| j| j| jS N)
isinstancetupler   applyr   trainingr   selfxx_cx_pr   r   r   r   A   s   
zFeatureDropout.forwardN)r"   r#   r$   __doc__r   r   r   r   r   r&   :   s    r&   c                       s   e Zd Z fddZ  ZS )PartitionedReLUc                    s@   t |tr
|\}}n
tj|ddd\}}t |t |fS N   r   dim)r(   r)   r   chunksuperr   r,   	__class__r   r   r   P   s   

zPartitionedReLU.forward)r"   r#   r$   r   __classcell__r   r   r9   r   r2   O   s    r2   c                       s&   e Zd Zd fdd	Zdd Z  ZS )PartitionedLinearTc                    s>   t    t|d |d || _t|d |d || _d S )Nr4   )r8   __init__nnLinearlinear_clinear_p)r-   in_featuresout_featuresbiasr9   r   r   r=   Y   s   
zPartitionedLinear.__init__c                 C   sD   t |tr
|\}}n
tj|ddd\}}| |}| |}||fS r3   )r(   r)   r   r7   r@   rA   )r-   r.   r/   r0   out_cout_pr   r   r   r   ^   s   



zPartitionedLinear.forward)Tr"   r#   r$   r=   r   r;   r   r   r9   r   r<   X   s    r<   c                       s*   e Zd Z	d fdd	Zd	ddZ  ZS )
PartitionedMultiHeadAttention皙?{Gz?c              	      s   t    tt||d d|d | _tt||d d|d | _tt||d |d | _tt||d |d | _	t
d| }| j| j| j| j	fD ]}tj|| | qTd|d  | _t|| _d S )Nr4      g      @r   r   )r8   r=   r>   	Parameterr   Tensorw_qkv_cw_qkv_pw_o_cw_o_pmathsqrtinituniform_scaling_factorDropoutdropout)r-   d_modeln_headd_qkvattention_dropoutinitializer_rangeboundparamr9   r   r   r=   j   s   
  z&PartitionedMultiHeadAttention.__init__Nc                 C   sf  t |tr
|\}}n
tj|ddd\}}td|| j}td|| j}dd tj|dddD \}}}	dd tj|dddD \}
}}tj||
gdd| j }tj||gdd}tj|	|gdd}td	||}|d ur|j	
|d d d d d d f  td
  tj|dd}| |}td||}tj|ddd\}}td|| j}td|| j}||fS )Nr4   r   r5   zbtf,hfca->bhtcac                 S      g | ]}|j d dqS rK   r5   squeeze.0cr   r   r   
<listcomp>       z9PartitionedMultiHeadAttention.forward.<locals>.<listcomp>rK   c                 S   r`   ra   rb   rd   r   r   r   rg      rh   zbhqa,bhka->bhqkinfzbhqk,bhka->bhqazbhta,haf->btf)r(   r)   r   r7   einsumrN   rO   catrV   datamasked_fill_floatFsoftmaxrX   rP   rQ   )r-   r.   maskr/   r0   qkv_cqkv_pq_ck_cv_cq_pk_pv_pqkvdotsprobsoo_co_prE   rF   r   r   r   r   {   s(   

  *
z%PartitionedMultiHeadAttention.forward)rI   rJ   r'   rG   r   r   r9   r   rH   i   s    rH   c                       s,   e Zd Ze f fdd	ZdddZ  ZS )"PartitionedTransformerEncoderLayerc	           	         st   t    t||||d| _t||| _t|| _t||| _t	
|| _t	
|| _t|| _t|| _|| _d S )N)r\   )r8   r=   rH   	self_attnr<   linear1r&   
ff_dropoutlinear2r>   	LayerNorm	norm_attnnorm_ffresidual_dropout_attnresidual_dropout_ff
activation)	r-   rY   rZ   r[   d_ffr   residual_dropoutr\   r   r9   r   r   r=      s   





z+PartitionedTransformerEncoderLayer.__init__Nc              	   C   sz   | j ||d}tj|dd}| |}| || }| | | | |}tj|dd}| 	|}| 
|| }|S )Nrq   r   r5   )r   r   rk   r   r   r   r   r   r   r   r   )r-   r.   rq   residualr   r   r   r      s   

z*PartitionedTransformerEncoderLayer.forwardr'   r"   r#   r$   r2   r=   r   r;   r   r   r9   r   r      s    	r   c                       s*   e Zd Zef fdd	ZdddZ  ZS )PartitionedTransformerEncoderc
           
   
      s:   t    t fddt|D | _d S )Nc                    s&   g | ]}t   d qS ))rY   rZ   r[   r   r   r   r\   r   )r   )re   ir   r\   r   rY   r[   r   rZ   r   r   r   rg      s    
z:PartitionedTransformerEncoder.__init__.<locals>.<listcomp>)r8   r=   r>   
ModuleListrangelayers)
r-   n_layersrY   rZ   r[   r   r   r   r\   r   r9   r   r   r=      s   
z&PartitionedTransformerEncoder.__init__Nc                 C   s   | j D ]}|||d}q|S )Nr   )r   )r-   r.   rq   layerr   r   r   r      s   
z%PartitionedTransformerEncoder.forwardr'   r   r   r   r9   r   r      s    
r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	ConcatPositionalEncodingz%
    Learns a position embedding
          c                    s0   t    tt||| _tj| j d S r'   )	r8   r=   r>   rL   r   FloatTensortiming_tablerT   normal_)r-   rY   max_lenr9   r   r   r=      s   
z!ConcatPositionalEncoding.__init__c                 C   sF   | j d |jd d d f }||jd dd}tj||gdd}|S )Nr   r   r   r5   )r   shapeexpandr   rk   )r-   r.   timingoutr   r   r   r      s   z ConcatPositionalEncoding.forward)r   r   )r"   r#   r$   r1   r=   r   r;   r   r   r9   r   r      s    r   c                       s*   e Zd Ze f fdd	Zdd Z  ZS )PartitionedTransformerModulec              
      s   t    tj|	|d |
d| _t|| _|dkr#t|d |d| _n|dkr1t	|d |d| _nt
d| t|| _t||||||||d| _d S )Nr4   )rD   sin)rY   r   learnedzUnhandled timing type: %s)rY   rZ   r[   r   r   r   r\   )r8   r=   r>   r?   project_pretrainedr&   pattention_morpho_emb_dropoutr   
add_timingr   r
   r   transformer_input_normr   pattn_encoder)r-   r   rY   rZ   r[   r   r   r   r\   word_input_sizerD   morpho_emb_dropoutr   encoder_max_lenr   r9   r   r   r=      s*   


z%PartitionedTransformerModule.__init__c                 C   s   |d j }|r
|}n"g }|D ]}|tjt||d qtjjjj|ddd}|dk}|j	|d}tjjjj|ddd}| 
|}	| | |	}
| |
}
| |
|}|S )Nr   )r	   Ti)batch_firstpadding_value)r	   appendr   oneslenr>   utilsrnnpad_sequencetor   r   r   r   r   )r-   attention_maskbert_embeddingsr	   valid_token_maskvalidssentpadded_datapadded_embeddingsextra_content_annotations
encoder_inannotationsr   r   r   r     s.   




z$PartitionedTransformerModule.forwardr   r   r   r9   r   r      s    *r   )r1   copyrR   r   torch.nnr>   torch.nn.functional
functionalro   .stanza.models.constituency.positional_encodingr   autogradfunctionInplaceFunctionr   rW   r&   ReLUr2   Moduler<   rH   r   r   r   r   r   r   r   r   <module>   s     *	*&