"""
A layer which implements maxout from the "Maxout Networks" paper

https://arxiv.org/pdf/1302.4389v4.pdf
Goodfellow, Warde-Farley, Mirza, Courville, Bengio

or a simpler explanation here:

https://stats.stackexchange.com/questions/129698/what-is-maxout-in-neural-network/298705#298705

The implementation here:
  for k layers of maxout, mapping in -> out channels, we make a single
    linear map of size in -> out*k
  then we reshape the last dimension to (..., k, out)
  and return the max over the k layers
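
For example (an illustrative sketch, not from the original docstring),
with in_channels=8, out_channels=4, maxout_k=3:

    layer = MaxoutLinear(8, 4, 3)
    x = torch.randn(2, 5, 8)   # any number of leading dimensions works
    y = layer(x)               # the linear gives (2, 5, 12), reshaped to
                               # (2, 5, 3, 4); y.shape == (2, 5, 4)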
"""

import torch
import torch.nn as nn


class MaxoutLinear(nn.Module):
    def __init__(self, in_channels, out_channels, maxout_k):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.maxout_k = maxout_k

        # one linear of size in -> out*k stands in for k separate linears
        self.linear = nn.Linear(in_channels, out_channels * maxout_k)
zMaxoutLinear.__init__c                 C   sF   |  |}|jg |jdd | j| jR  }tj|ddd }|S )z
        Use the oversized linear as the repeated linear, then take the max

        One large linear map makes the implementation simpler and easier for pytorch to make parallel
        N)dimr   )r
   viewshaper   r   torchmax)r   inputsoutputsr   r   r   forward    s   
&zMaxoutLinear.forward)__name__
__module____qualname__r   r   __classcell__r   r   r   r   r      s    	r   )__doc__r   torch.nnr   Moduler   r   r   r   r   <module>   s    
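

if __name__ == "__main__":
    # Illustrative check, not part of the original module: the fused linear
    # plus reshape is equivalent to taking the elementwise max over maxout_k
    # separate linear maps carved out of the fused weight matrix.
    torch.manual_seed(0)
    layer = MaxoutLinear(in_channels=4, out_channels=3, maxout_k=2)
    x = torch.randn(5, 4)
    y = layer(x)

    # rows 0:3 of the fused weight are maxout piece 0, rows 3:6 are piece 1
    w, b = layer.linear.weight, layer.linear.bias
    piece0 = x @ w[:3].t() + b[:3]
    piece1 = x @ w[3:].t() + b[3:]
    assert torch.allclose(y, torch.maximum(piece0, piece1))
    print("fused maxout matches the max of separate linears:", tuple(y.shape))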