# Translate an uploaded PDF from English to Romanian inside Google Colab.
#
# Dependency installation. These are IPython shell escapes, not Python, so
# they are kept as comments; run them in a Colab cell before this script:
#   !pip install pdfplumber reportlab transformers torch sentencepiece matplotlib
#   !wget https://github.com/dejavu-fonts/dejavu-fonts/releases/download/version_2_37/dejavu-fonts-ttf-2.37.tar.bz2
#   !tar -xjf dejavu-fonts-ttf-2.37.tar.bz2
#   !mv dejavu-fonts-ttf-2.37/ttf/DejaVuSans.ttf .

import os
import re
from io import BytesIO

import pdfplumber
import torch
from google.colab import files
from matplotlib import pyplot as plt
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Folosesc dispozitivul: {device}")

# Load the local EN -> RO translation model and move it to the chosen device.
model_name = "Helsinki-NLP/opus-mt-en-ro"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Token substituted for each formula while the surrounding text is translated.
MATH_PLACEHOLDER = '###MATH###'

# Display math ($$...$$) is tried first, then inline math ($...$). Inline math
# may not contain '$', so two adjacent formulas such as "$a$$b$" are matched
# separately, and an empty "$$" is not treated as a formula.
MATH_PATTERN = re.compile(r'\$\$.+?\$\$|\$[^$\n]+\$')

FONT_NAME = 'DejaVuSans'
FONT_FILE = 'DejaVuSans.ttf'
BASE_FONT_SIZE = 12
MIN_FONT_SIZE = 6


def extract_text_lines_with_position(pdf_path):
    """Extract every text line of a PDF together with its position.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one dict per text line holding the keys 'text', 'x0',
        'top', 'page' (1-based), 'width', 'height' (page dimensions) and
        'x1'; or None when the extraction raised an exception.
    """
    text_data = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = tqdm(pdf.pages, total=len(pdf.pages), desc="Extragere text")
            for page_num, page in enumerate(pages, 1):
                text_data.extend(
                    {
                        'text': line['text'],
                        'x0': line['x0'],
                        'top': line['top'],
                        'page': page_num,
                        'width': page.width,
                        'height': page.height,
                        'x1': line['x1'],
                    }
                    for line in page.extract_text_lines()
                )
        return text_data
    except Exception as e:
        print(f"Eroare la extragerea textului: {str(e)}")
        return None


def detect_math_formulas(text):
    """Find math formulas delimited by $...$ or $$...$$.

    Args:
        text: The string to scan.

    Returns:
        A tuple (formulas, text_with_placeholders): the matched formulas in
        order of appearance, and the text with each formula replaced by the
        placeholder '###MATH###'.
    """
    formulas = MATH_PATTERN.findall(text)
    return formulas, MATH_PATTERN.sub(MATH_PLACEHOLDER, text)


def translate_text_batch(texts, from_lang="en", to_lang="ro",
                         batch_size=32):
    """Translate a list of texts in batches on the configured device.

    Formulas are replaced by a placeholder before translation and put back
    afterwards, in order.

    Args:
        texts: Strings to translate.
        from_lang: Source language code. Currently unused: the loaded model
            is fixed to EN -> RO.
        to_lang: Target language code. Currently unused, see from_lang.
        batch_size: Number of texts sent to the model per generate() call.

    Returns:
        The translated strings, in the same order as texts.
    """
    translated_texts = []
    with tqdm(total=len(texts), desc="Traducere text") as pbar:
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            # Strip the formulas out of every text so they are not translated.
            formulas_batch = []
            texts_with_placeholders = []
            for text in batch:
                formulas, text_with_placeholders = detect_math_formulas(text)
                formulas_batch.append(formulas)
                texts_with_placeholders.append(text_with_placeholders)

            inputs = tokenizer(
                texts_with_placeholders,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)
            with torch.no_grad():
                translated = model.generate(**inputs)

            # NOTE(review): this assumes the model copies the placeholder
            # through unchanged; if it alters it, the formula is not
            # reinserted. Verify against real model output.
            for formulas, translated_ids in zip(formulas_batch, translated):
                translated_text = tokenizer.decode(
                    translated_ids, skip_special_tokens=True)
                for formula in formulas:
                    translated_text = translated_text.replace(
                        MATH_PLACEHOLDER, formula, 1)
                translated_texts.append(translated_text)
            pbar.update(len(batch))
    return translated_texts


def render_math_formula(formula):
    """Render a string containing math markup as a PNG using matplotlib.

    Args:
        formula: Text to draw; matplotlib interprets the $...$ parts.

    Returns:
        A BytesIO positioned at offset 0 that holds the PNG data.

    Raises:
        ValueError: Propagated from matplotlib when it cannot parse the
            math markup.
    """
    fig, ax = plt.subplots(figsize=(len(formula) * 0.1 + 1, 0.5))
    try:
        ax.text(0.5, 0.5, formula, fontsize=12, ha='center', va='center')
        ax.axis('off')
        img_buffer = BytesIO()
        fig.savefig(img_buffer, format='png', bbox_inches='tight', dpi=100,
                    transparent=True)
    finally:
        # Close this figure even when rendering fails, so figures do not
        # accumulate across the lines of a document.
        plt.close(fig)
    img_buffer.seek(0)
    return img_buffer


def _page_size(page_num, page_sizes, fallback):
    """Return the (width, height) of 1-based page_num, or fallback if unknown."""
    if page_sizes and 0 < page_num <= len(page_sizes):
        return tuple(page_sizes[page_num - 1])
    return fallback


def _draw_fitted_text(c, item, text, y_pos):
    """Draw text at the line's x0, shrinking the font to fit the original width."""
    c.setFont(FONT_NAME, BASE_FONT_SIZE)
    original_width = item['x1'] - item['x0']
    text_width = c.stringWidth(text, FONT_NAME, BASE_FONT_SIZE)
    if text_width > original_width:
        font_size = BASE_FONT_SIZE * (original_width / text_width)
        c.setFont(FONT_NAME, max(font_size, MIN_FONT_SIZE))
    c.drawString(item['x0'], y_pos, text)


def text_to_pdf(text_data, translated_texts, output_path, page_sizes):
    """Write the translated lines to a PDF, rendering formulas as images.

    Args:
        text_data: Line dicts as produced by extract_text_lines_with_position.
        translated_texts: Translations, one per entry of text_data.
        output_path: Path of the PDF file to create.
        page_sizes: (width, height) of every page of the source PDF, in page
            order. Used to size the first page and to emit blank pages for
            pages without text, so the page count matches the source.

    Returns:
        True when the PDF was written, False when an error occurred (the
        error is printed).
    """
    try:
        if len(text_data) != len(translated_texts):
            raise ValueError(
                "text_data and translated_texts differ in length")

        pdfmetrics.registerFont(TTFont(FONT_NAME, FONT_FILE))
        c = canvas.Canvas(output_path, pagesize=letter)

        # The first page must be sized explicitly; the loop below only
        # changes the size when it moves on to another page.
        if text_data:
            fallback = (text_data[0]['width'], text_data[0]['height'])
        else:
            fallback = letter
        current_page = 1
        c.setPageSize(_page_size(current_page, page_sizes, fallback))

        with tqdm(total=len(text_data), desc="Generare PDF") as pbar:
            for item, translated_text in zip(text_data, translated_texts):
                if item['page'] != current_page:
                    c.showPage()
                    # Emit a blank page for every page that had no text
                    # lines, so later pages keep their page number.
                    item_size = (item['width'], item['height'])
                    for skipped in range(current_page + 1, item['page']):
                        c.setPageSize(
                            _page_size(skipped, page_sizes, item_size))
                        c.showPage()
                    current_page = item['page']
                    c.setPageSize(item_size)

                # pdfplumber measures 'top' from the top edge of the page,
                # while the canvas y axis starts at the bottom.
                y_pos = item['height'] - item['top']

                img_buffer = None
                if '$' in translated_text:
                    try:
                        img_buffer = render_math_formula(translated_text)
                    except ValueError:
                        # Unparseable math markup: fall back to plain text
                        # instead of aborting the whole document.
                        img_buffer = None

                if img_buffer is not None:
                    text_width = c.stringWidth(
                        translated_text, FONT_NAME, BASE_FONT_SIZE)
                    # drawImage takes a filename or an ImageReader, so the
                    # in-memory PNG is wrapped; mask='auto' keeps the
                    # transparent background transparent.
                    c.drawImage(ImageReader(img_buffer), item['x0'],
                                y_pos - 10, width=text_width, height=20,
                                preserveAspectRatio=True, mask='auto')
                else:
                    _draw_fitted_text(c, item, translated_text, y_pos)
                pbar.update(1)

        # Emit blank pages for source pages after the last line of text.
        total_pages = len(page_sizes) if page_sizes else 0
        if text_data and current_page < total_pages:
            c.showPage()
            for trailing in range(current_page + 1, total_pages + 1):
                c.setPageSize(_page_size(trailing, page_sizes, fallback))
                c.showPage()

        c.save()
        return True
    except Exception as e:
        print(f"Eroare la crearea PDF-ului: {str(e)}")
        return False


def translate_pdf():
    """Translate a PDF uploaded in Colab and download the result.

    Prompts for an upload, extracts the text lines, translates them, writes
    the translated PDF under /content/output and starts its download.
    Returns early, without downloading, when nothing was uploaded, no text
    could be extracted, or the PDF could not be written.
    """
    print("Încarcă un fișier PDF de pe calculator:")
    uploaded = files.upload()
    if not uploaded:
        print("Niciun fișier încărcat.")
        return

    pdf_path = next(iter(uploaded))
    output_dir = "/content/output"
    os.makedirs(output_dir, exist_ok=True)

    print("Încep procesarea...")
    text_data = extract_text_lines_with_position(pdf_path)
    if not text_data:
        return

    original_texts = [item['text'] for item in text_data]
    translated_texts = translate_text_batch(original_texts, batch_size=32)

    # basename keeps the output inside output_dir even if the upload key
    # contains directory components.
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(output_dir, f"{base_name}_translated.pdf")
    with pdfplumber.open(pdf_path) as pdf:
        page_sizes = [(page.width, page.height) for page in pdf.pages]

    if not text_to_pdf(text_data, translated_texts, output_path, page_sizes):
        # The error was already printed; do not report success or try to
        # download a file that was not written.
        return

    print(f"Traducere completă! Descarcă fișierul: {output_path}")
    files.download(output_path)


# Run the main function when executed as a script or in a notebook cell.
if __name__ == "__main__":
    translate_pdf()