from translatepy import Translator
from django.utils.text import slugify
from bs4 import BeautifulSoup

from .stopwords import GENERAL_META_DESCRIPTION, GENERAL_META_KEYWORDS

# Single shared translator instance, created once at import time.
translator = Translator()


def translate_to_english(text):
    """
    Translate text to English using the module-level translator.

    Args:
        text (str): The text to translate

    Returns:
        str: The translation result, or the original text unchanged if the
            translation call raises any exception
    """
    try:
        translation = translator.translate(text, "English")
        return translation.result
    except Exception:
        # Best-effort: any translation failure falls back to the input text.
        return text


def extract_superscripts(html_text):
    """
    Extracts the text of every <sup> element found in an HTML string.

    Args:
        html_text (str): The HTML content to parse

    Returns:
        list: A list of all superscript texts found in order of appearance
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    return [sup.get_text() for sup in soup.find_all('sup')]

def html_to_plain_text(html_content):
    """
    Convert HTML content to plain text by removing HTML tags and decoding HTML entities.

    DOCTYPE declarations and comments are dropped, a fixed set of block-level
    tags is turned into line breaks, every remaining tag is removed, entities
    are decoded and whitespace is normalised.

    Args:
        html_content (str): HTML content to convert. None or an empty string
            yields an empty string.

    Returns:
        str: Plain text without HTML tags and with decoded HTML entities
    """
    import re
    from html import unescape

    if not html_content:
        return ''

    # Remove DOCTYPE and comments first. Comments must go before any tag
    # handling: the generic tag pattern below stops at the first '>', so a
    # comment containing '>' or nested tags would otherwise leak its text.
    html_content = re.sub(r'<!DOCTYPE[^>]*>', '', html_content, flags=re.IGNORECASE)
    html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

    # Replace common block elements with newlines before and after.
    # NOTE: the opening pattern has no word boundary after the tag name, so a
    # tag whose name merely starts with one of these (e.g. <pre> for 'p')
    # is also turned into a newline.
    for tag in ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'tr']:
        html_content = re.sub(fr'<\s*{tag}[^>]*>', '\n', html_content, flags=re.IGNORECASE)
        html_content = re.sub(fr'<\s*/\s*{tag}\s*>', '\n', html_content, flags=re.IGNORECASE)

    # Special handling for line breaks
    html_content = re.sub(r'<\s*br\s*/?>', '\n', html_content, flags=re.IGNORECASE)

    # Remove all remaining HTML tags
    html_content = re.sub(r'<[^>]*>', '', html_content)

    # Decode HTML entities (after tag removal, so escaped markup such as
    # &lt;b&gt; survives as literal text)
    html_content = unescape(html_content)

    # Fix whitespace
    # Replace multiple spaces with a single space
    html_content = re.sub(r' +', ' ', html_content)
    # Remove leading and trailing whitespace on each line. This has to happen
    # before newline runs are collapsed: whitespace-only lines become empty
    # here and would otherwise leave more than two newlines in a row.
    html_content = '\n'.join(line.strip() for line in html_content.splitlines())
    # Replace multiple newlines with a maximum of two newlines
    html_content = re.sub(r'\n{3,}', '\n\n', html_content)
    # Remove leading and trailing whitespace from the entire text
    html_content = html_content.strip()

    return html_content

from bs4 import BeautifulSoup
import re

# Matches the whole [footnotes]...[/footnotes] block; group 1 is its body.
_FOOTNOTES_BLOCK_RE = re.compile(r'\[footnotes\](.*?)\[\/footnotes\]', re.DOTALL)

# Matches one "[n] text" entry. The text runs until the next line starting
# with "[n]", a blank line, or the end of the block, whichever comes first.
_FOOTNOTE_ENTRY_RE = re.compile(r'\[(\d+)\]\s*(.*?)(?=\n\[\d+\]|\Z|\n\s*\n)', re.DOTALL)

# Matches an inline "[n]" reference.
_REFERENCE_RE = re.compile(r'\[(\d+)\]')


def _parse_footnotes(footnotes_text):
    """Return {number: text} for every "[n] text" entry in the block body."""
    # One pass is enough: this pattern matches wherever "[n]" occurs, so a
    # looser fallback pattern could never find an entry that this one missed.
    # A repeated number keeps the last entry seen.
    return {
        int(match.group(1)): match.group(2).strip()
        for match in _FOOTNOTE_ENTRY_RE.finditer(footnotes_text)
    }


def _build_footnotes_template(footnotes):
    """Render the "References" section, one entry per footnote in numeric order."""
    parts = ["\n\nReferences\n\n"]
    parts.extend(
        f"\n\n[{num}]\n\n{text}\n\n" for num, text in sorted(footnotes.items())
    )
    parts.append("\n\n")
    return "".join(parts)


def process_footnotes(html_content):
    """
    Processes HTML content with numbered references in [1] format and creates a footnotes section.

    The first [footnotes]...[/footnotes] block is parsed into numbered
    entries, every such block is removed from the content, and each [n]
    reference found in a text node is rewritten to ^n.

    NOTE(review): earlier documentation for this function described linked
    superscript markup and back-to-text links, but the replacement and
    template strings used here contain no markup - confirm the intended
    output format.

    Args:
        html_content (str): HTML string containing references like [1] and footnotes in the format:
            [footnotes]
            [1] Footnote text...
            [2] Another footnote...
            [/footnotes]

    Returns:
        dict: {
            'processed_content': content with the footnotes block removed and
                references rewritten (the input unchanged when it is empty or
                has no footnotes block),
            'footnotes_template': "References" section listing each footnote
                in numeric order ('' when there is no footnotes block)
        }
    """
    # Nothing to do for empty / missing content.
    if not html_content:
        return {
            'processed_content': html_content,
            'footnotes_template': ''
        }

    # Extract the footnotes block using regex
    footnotes_block = _FOOTNOTES_BLOCK_RE.search(html_content)

    if not footnotes_block:
        return {
            'processed_content': html_content,
            'footnotes_template': ''
        }

    # Parse individual footnotes ([1] Note text format)
    footnotes = _parse_footnotes(footnotes_block.group(1).strip())

    # Remove the original footnotes block from content
    processed_html = _FOOTNOTES_BLOCK_RE.sub('', html_content)

    # Local import: Comment is only needed on this branch.
    from bs4 import Comment

    # Process text to rewrite [1] style references
    soup = BeautifulSoup(processed_html, 'html.parser')

    # Find all text nodes that contain [number] patterns
    for text_node in soup.find_all(string=True):
        # find_all(string=True) also yields HTML comments; rewriting one
        # would replace the comment with visible text, so leave them alone.
        if isinstance(text_node, Comment):
            continue
        if not text_node.parent or text_node.parent.name in ['script', 'style']:
            continue

        original_text = str(text_node)
        new_text = _REFERENCE_RE.sub(lambda m: f'^{m.group(1)}', original_text)

        if new_text != original_text:
            # NOTE(review): the rewritten text is re-parsed as HTML, so markup
            # that was entity-escaped in the source text becomes live markup
            # here - confirm the content is trusted.
            text_node.replace_with(BeautifulSoup(new_text, 'html.parser'))

    return {
        'processed_content': str(soup),
        'footnotes_template': _build_footnotes_template(footnotes)
    }

from django.conf import settings

def get_meta_contents(
    meta_title="MIMAMSHA",
    meta_description=GENERAL_META_DESCRIPTION,
    meta_keywords=GENERAL_META_KEYWORDS,
    og_image="/static/icons/MIMAMSHA.png",
    twitter_image="/static/icons/MIMAMSHA.png",
    canonical_url="https://www.mimamsha.com",
    **kwargs  # For any additional custom fields
):
    """
    Build the dictionary of page meta values (primary, OpenGraph, Twitter).

    The title and description are reused for the OpenGraph and Twitter
    entries. Extra keyword arguments are merged in last, so they can add new
    keys or override any of the entries listed below.

    Args:
        meta_title (str): Page title, also used for og_title and twitter_title
        meta_description (str): Page description, also used for
            og_description and twitter_description
        meta_keywords (str): Value for the meta_keywords entry
        og_image (str): Image path/URL for the og_image entry
        twitter_image (str): Image path/URL for the twitter_image entry
        canonical_url (str): Value for the canonical_url entry
        **kwargs: Additional custom fields merged into the result

    Returns:
        dict: Meta values keyed by name
    """
    return {
        # Primary Meta Tags
        'meta_title': meta_title,
        'meta_description': meta_description,
        'meta_keywords': meta_keywords,
        'author': 'MIMAMSHA',

        # OpenGraph/Facebook Meta
        'og_type': 'website',
        'canonical_url': canonical_url,
        'og_title': meta_title,  # Usually same as meta_title
        'og_description': meta_description,
        'og_image': og_image,
        'site_name': 'MIMAMSHA',

        # Twitter Meta
        'twitter_card': 'summary_large_image',
        'twitter_title': meta_title,
        'twitter_description': meta_description,
        'twitter_image': twitter_image,
        'twitter_site': '@mimamsha',
        'twitter_creator': '@mimamsha',

        # Technical Meta
        'robots': 'index, follow',

        # Allow additional custom fields
        **kwargs
    }