#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
randomness_report_cli.py
Analiză avansată a aleatorității pentru fișiere mari de biți (0/1) și raport .txt în limba română.

Teste incluse (subset avansat):
 1) Entropie Shannon
 2) Monobit (chi-square, p-value)         [scipy pentru p]
 3) Runs test (z, p)                      [scipy pentru p]
 4) Block-frequency (chi-square, p)       [scipy pentru p]
 5) Serial (pattern freq, m variabil)     [scipy pentru p]
 6) Approximate Entropy (ApEn, m=2)
 7) Lempel–Ziv complexity (estimare)
 8) Longest run of ones
 9) Autocorelație (max lag)
10) FFT (căutare vârfuri/periodicități)
11) CUSUM (z, p)                          [scipy pentru p]
12) Compresibilitate (zlib) ca proxy Maurer-Universal

Dependențe:
 - Necesită Python 3.8+.
 - Recomandat: scipy (pentru p-value). Fără scipy, p-value-urile nu vor fi calculate.

Exemple:
  python3 randomness_report_cli.py --input "datecriptate.txt" --out raport_randomness.txt
  python3 randomness_report_cli.py --input "datebinare.txt" --max-bits 1000000 --block 128 --m 4 --lag 1024
"""

import argparse, os, math, time, zlib
from collections import Counter

try:
    import numpy as np
except Exception as e:
    raise SystemExit("Acest script necesită numpy. Instalați: pip install numpy")

# scipy este opțional, dar recomandat pentru p-value
_have_scipy = True
try:
    from scipy import stats
except Exception:
    _have_scipy = False

# -------------------- citire streaming --------------------
def read_bitfile_stream(path, max_bits=None):
    """Read a text file of '0'/'1' characters into a NumPy ``uint8`` array.

    The file is read in binary chunks.  Every byte other than ASCII ``0`` and
    ``1`` (newlines, spaces, separators, any other text) is skipped, so the
    bits may be laid out on any number of lines.  Bits are accumulated in a
    ``bytearray`` (one byte per bit) rather than a list of Python ints, which
    keeps memory use proportional to the number of bits kept.

    Args:
        path: Path of the file to read.
        max_bits: Maximum number of bits to return.  ``None`` or any value
            <= 0 means "no limit".

    Returns:
        One-dimensional ``numpy.uint8`` array with one 0/1 value per bit,
        in file order.
    """
    limit = max_bits if max_bits and max_bits > 0 else None

    # Translation table: b'0' -> 0 and b'1' -> 1.  Every other byte value is
    # listed in `junk`, which bytes.translate deletes before mapping.
    table = bytearray(256)
    table[ord('1')] = 1
    table = bytes(table)
    keep = (ord('0'), ord('1'))
    junk = bytes(v for v in range(256) if v not in keep)

    buf = bytearray()
    with open(path, 'rb') as f:
        while limit is None or len(buf) < limit:
            chunk = f.read(1 << 20)
            if not chunk:
                break
            buf += chunk.translate(table, junk)
    if limit is not None:
        # The last chunk may have pushed us past the limit; trim the excess.
        del buf[limit:]
    return np.frombuffer(buf, dtype=np.uint8)

# -------------------- teste --------------------
def shannon_entropy(bits):
    """Return the Shannon entropy (base 2) of the 0/1 symbol frequencies.

    Args:
        bits: NumPy array of 0/1 values.

    Returns:
        A float between 0.0 and 1.0; 0.0 for an empty sequence.
    """
    if len(bits) == 0:
        return 0.0
    frac_one = float(bits.mean())
    entropy = 0.0
    # Add -p*log2(p) for the zero symbol, then for the one symbol; a symbol
    # that never occurs contributes nothing.
    for prob in (1.0 - frac_one, frac_one):
        if prob > 0:
            entropy -= prob * math.log2(prob)
    return entropy

def monobit_test(bits):
    """Chi-square test of the counts of ones and zeros against a 50/50 split.

    Args:
        bits: NumPy array of 0/1 values.

    Returns:
        Tuple ``(chi2, p, ones, zeros)``.  ``p`` is ``None`` when scipy is
        not available.  For an empty sequence ``chi2`` is 0.0.
    """
    n = len(bits)
    ones = int(bits.sum())
    zeros = n - ones
    expected = n / 2.0
    chi2 = 0.0
    if expected > 0:
        chi2 = ((ones - expected) ** 2) / expected + ((zeros - expected) ** 2) / expected
    if _have_scipy:
        # sf() is the upper tail computed directly; 1 - cdf() cancels to
        # exactly 0.0 once the p-value drops below about 1e-16.
        p = float(stats.chi2.sf(chi2, df=1))
    else:
        p = None
    return chi2, p, ones, zeros

def runs_test(bits):
    """Runs test: compare the number of runs with its expectation.

    A run is a maximal stretch of identical consecutive bits.

    Args:
        bits: NumPy array of 0/1 values.

    Returns:
        One of two dict shapes:

        * ``{'ok': False, 'reason': str}`` when the proportion of ones is
          further than ``2 / sqrt(n)`` from 0.5, so the test is not run.
          This check no longer depends on scipy being installed.
        * ``{'runs', 'expected', 'z', 'p', 'ok'}`` otherwise.  ``z`` is NaN
          and ``p`` is ``None`` when the variance is not positive or scipy
          is missing (``ok`` is then ``True``).  When ``p`` is computed,
          ``ok`` is ``p >= 0.01``.
    """
    n = len(bits)
    if n == 0:
        # Same result the formulas below would give, without taking the
        # mean of an empty array.
        return {'runs': 0, 'expected': float('nan'), 'z': float('nan'), 'p': None, 'ok': True}

    pi = float(bits.mean())
    # Prerequisite: the proportion of ones must be close to 0.5.
    if abs(pi - 0.5) > 2 / math.sqrt(n):
        return {'ok': False, 'reason': 'Proporţia de 1 nu este aproape de 0.5.'}

    # A new run starts at every position where a bit differs from its
    # predecessor, plus one for the first run.
    runs = 1 + int(np.sum(bits[1:] != bits[:-1]))
    base = 2 * n * pi * (1 - pi)
    expected = base + 1
    var = (base * (base - 1)) / (n - 1) if n > 1 else 0.0
    if var <= 0 or not _have_scipy:
        return {'runs': runs, 'expected': expected, 'z': float('nan'), 'p': None, 'ok': True}

    z = (runs - expected) / math.sqrt(var)
    # Two-sided tail via sf(): avoids the cancellation in 1 - cdf().
    p = float(2 * stats.norm.sf(abs(z)))
    return {'runs': runs, 'expected': expected, 'z': z, 'p': p, 'ok': p >= 0.01}

def block_frequency(bits, block_size=128):
    """Block-frequency test: proportion of ones inside fixed-size blocks.

    The sequence is cut into ``nb = n // block_size`` non-overlapping blocks
    (trailing bits that do not fill a block are ignored).  The statistic is
    ``4 * block_size * sum((prop_i - 0.5) ** 2)``.  Each block contributes
    one independent term and the reference proportion 0.5 is not estimated
    from the data, so the statistic is compared against a chi-square
    distribution with ``nb`` degrees of freedom.

    Args:
        bits: NumPy array of 0/1 values.
        block_size: Number of bits per block; must be at least 1.

    Returns:
        Dict with keys ``'stat'``, ``'p'`` (``None`` without scipy),
        ``'nb_blocks'`` and ``'mean_prop'``.

    Raises:
        ValueError: if ``block_size`` is below 1 or larger than the sequence.
    """
    if block_size < 1:
        raise ValueError("Dimensiunea blocului trebuie să fie cel puțin 1.")
    n = len(bits)
    nb = n // block_size
    if nb == 0:
        raise ValueError("Bloc prea mare pentru lungimea secvenței.")

    # One row per block; row sums are the per-block counts of ones.
    blocks = np.asarray(bits)[:nb * block_size].reshape(nb, block_size)
    props = blocks.sum(axis=1) / block_size
    stat = float(4 * block_size * np.sum((props - 0.5) ** 2))
    if _have_scipy:
        p = float(stats.chi2.sf(stat, df=nb))
    else:
        p = None
    return {'stat': stat, 'p': p, 'nb_blocks': nb, 'mean_prop': float(props.mean())}

def serial_test(bits, m=4):
    """Chi-square test on the frequencies of overlapping m-bit patterns.

    All ``n - m + 1`` overlapping windows of length ``m`` are counted and
    compared with a uniform expectation over the ``2 ** m`` possible
    patterns.

    NOTE(review): overlapping windows are not independent, so the chi-square
    p-value with ``2 ** m - 1`` degrees of freedom is presumably only an
    approximation — confirm whether that is acceptable for the report.

    Args:
        bits: Sequence of 0/1 values.
        m: Pattern length; must be at least 1.

    Returns:
        Dict with keys ``'m'``, ``'chi2'``, ``'p'`` (``None`` without scipy)
        and ``'distinct'`` (number of patterns actually observed).

    Raises:
        ValueError: if ``m`` is below 1 or the sequence has fewer than
            ``m + 1`` bits.
    """
    if m < 1:
        raise ValueError("m trebuie să fie cel puțin 1.")
    n = len(bits)
    if n < m + 1:
        raise ValueError("Secvență prea scurtă pentru m ales.")
    total = n - m + 1
    s = ''.join('1' if x else '0' for x in bits)
    counts = Counter(s[i:i + m] for i in range(total))
    n_patterns = 2 ** m
    expected = total / n_patterns
    chi2 = sum((c - expected) ** 2 / expected for c in counts.values())
    # Patterns that never occur are absent from the Counter, yet each one
    # still contributes (0 - expected)^2 / expected = expected to the sum.
    chi2 += (n_patterns - len(counts)) * expected
    if _have_scipy:
        p = float(stats.chi2.sf(chi2, df=n_patterns - 1))
    else:
        p = None
    return {'m': m, 'chi2': float(chi2), 'p': p, 'distinct': len(counts)}

def approximate_entropy(bits, m=2):
    """Approximate entropy ApEn(m) = phi(m) - phi(m + 1), in nats.

    ``phi(k)`` is the sum of ``p * ln(p)`` over the relative frequencies
    ``p`` of the overlapping k-bit patterns (windows are not wrapped around
    the end of the sequence).  When every (m+1)-bit pattern is about equally
    frequent the result is close to ln 2 (about 0.693); a sequence in which
    each m-bit pattern is always followed by the same bit gives a value near
    zero.

    Args:
        bits: Sequence of 0/1 values.
        m: Shorter of the two pattern lengths compared.

    Returns:
        The ApEn value as a float, or NaN when the sequence has fewer than
        ``m + 2`` bits.
    """
    n = len(bits)
    if n < m + 2:
        return float('nan')
    # Built once and shared by both phi() evaluations.
    s = ''.join('1' if x else '0' for x in bits)

    def phi(width):
        total = n - width + 1
        counts = Counter(s[i:i + width] for i in range(total))
        # c / total is already a probability, so the sum needs no further
        # division by the number of windows.
        return sum((c / total) * math.log(c / total) for c in counts.values())

    return float(phi(m) - phi(m + 1))

def lz_complexity(bits):
    """Count the phrases of an incremental dictionary parse of the sequence.

    The sequence is scanned left to right; the current phrase grows one bit
    at a time until it is a string not seen as a phrase before, at which
    point it is recorded and a new phrase starts.  A trailing phrase that is
    cut off by the end of the sequence is counted as well.

    Args:
        bits: Sequence of 0/1 values.

    Returns:
        Number of phrases as an int (0 for an empty sequence).
    """
    seen = set()
    phrase = ''
    total = 0
    for bit in bits:
        phrase += '1' if bit else '0'
        if phrase not in seen:
            seen.add(phrase)
            total += 1
            phrase = ''
    if phrase:
        # The input ended in the middle of a phrase already in the dictionary.
        total += 1
    return int(total)

def longest_run_ones(bits):
    """Return the length of the longest stretch of consecutive ones.

    Args:
        bits: Sequence of 0/1 values.

    Returns:
        Length of the longest run of ones as an int; 0 when there is none.
    """
    # Render the sequence as text and split on the zeros: each remaining
    # piece is one run of ones (possibly empty).
    marks = ''.join('1' if b == 1 else '0' for b in bits)
    return int(max(len(piece) for piece in marks.split('0')))

def autocorr(bits, maxlag=1024):
    """Normalised autocorrelation of the mean-centred sequence.

    For each lag from 1 up to ``min(maxlag, n - 1)`` the mean product of the
    centred sequence with its shifted copy is divided by the variance.

    Args:
        bits: NumPy array of 0/1 values.
        maxlag: Largest lag to evaluate.

    Returns:
        Float array with one coefficient per lag (index 0 is lag 1).  Empty
        when the sequence has fewer than two bits or zero variance.
    """
    n = len(bits)
    nothing = np.zeros(0, dtype=float)
    if n < 2:
        return nothing
    centred = bits.astype(float) - float(bits.mean())
    variance = float(np.var(centred))
    if variance == 0.0:
        return nothing

    last_lag = min(maxlag, n - 1)
    coeffs = []
    for lag in range(1, last_lag + 1):
        head = centred[:-lag]
        tail = centred[lag:]
        coeffs.append(float(np.sum(head * tail) / (n - lag) / variance))
    return np.array(coeffs, dtype=float)

def fft_peaks(bits, threshold_std=6, max_peaks=50):
    """Find spectral bins whose magnitude stands out from the rest.

    The bits are mapped to -1/+1 and transformed with a real FFT.  A bin is
    a peak when its magnitude exceeds ``mean + threshold_std * std`` of all
    magnitudes.  Bin 0 (the constant component) is never reported.

    Args:
        bits: NumPy array of 0/1 values.
        threshold_std: Number of standard deviations above the mean
            magnitude a bin must exceed.
        max_peaks: Maximum number of peaks returned.

    Returns:
        List of ``(frequency, magnitude)`` tuples sorted by decreasing
        magnitude, where frequency is in cycles per bit as given by
        ``numpy.fft.rfftfreq``.  Empty for an empty sequence.
    """
    x = 2 * bits.astype(float) - 1.0
    n = len(x)
    if n == 0:
        return []
    mag = np.abs(np.fft.rfft(x))
    thresh = float(mag.mean() + threshold_std * mag.std())
    # Compute the frequency axis once instead of once per reported peak.
    freqs = np.fft.rfftfreq(n)
    mask = mag > thresh
    mask[0] = False  # bin 0 is excluded from the peak search
    peaks = [(float(f), float(a)) for f, a in zip(freqs[mask], mag[mask])]
    peaks.sort(key=lambda t: t[1], reverse=True)
    return peaks[:max_peaks]

def cusum(bits):
    """Cumulative-sum statistic of the -1/+1 walk built from the bits.

    The bits are mapped to -1/+1 and summed cumulatively; ``max_abs`` is the
    largest absolute partial sum and ``z = max_abs / sqrt(n)``.

    NOTE(review): the p-value is ``2 * (1 - Phi(z))``, a plain normal-tail
    value.  This is presumably an approximation rather than the series used
    by the NIST SP 800-22 cumulative-sums test — confirm it is intended.

    Args:
        bits: NumPy array of 0/1 values.

    Returns:
        Dict with keys ``'z'`` (NaN for an empty sequence), ``'p'``
        (``None`` without scipy or for an empty sequence) and ``'max_abs'``.
    """
    n = len(bits)
    if n == 0:
        return {'z': float('nan'), 'p': None, 'max_abs': 0}
    walk = np.cumsum(2 * bits.astype(int) - 1)
    max_abs = int(np.max(np.abs(walk)))
    z = float(max_abs / math.sqrt(n))
    if _have_scipy:
        # sf() avoids the cancellation of 1 - cdf() for large z.
        p = float(2 * stats.norm.sf(abs(z)))
    else:
        p = None
    return {'z': z, 'p': p, 'max_abs': max_abs}

def compression_ratio(bits):
    """Ratio of zlib-compressed size to packed size of the bit sequence.

    The bits are packed eight per byte, first bit in the most significant
    position; trailing bits that do not fill a whole byte are dropped.

    Args:
        bits: Sequence of 0/1 values.

    Returns:
        ``len(compressed) / len(packed)`` as a float, or 1.0 when fewer
        than 8 bits are available.  Values above 1 are possible for short
        inputs.
    """
    n = len(bits)
    if n < 8:
        return 1.0
    usable = n - (n % 8)
    # np.packbits defaults to big-endian bit order, i.e. the first bit of
    # each group of eight becomes the most significant bit of the byte.
    packed = np.packbits(np.asarray(bits[:usable], dtype=np.uint8)).tobytes()
    return float(len(zlib.compress(packed)) / len(packed))

# -------------------- raport --------------------
def verdict_from_p(p, alpha=0.01):
    """Map a p-value to the verdict word used in the report.

    Args:
        p: The p-value, or ``None`` when it was not computed.
        alpha: Significance level; ``p >= alpha`` counts as a pass.

    Returns:
        ``"trece"`` (pass), ``"respins"`` (rejected), or an "n/a" marker
        when ``p`` is ``None``.
    """
    if p is None:
        return "n/a (fără scipy)"
    return "trece" if p >= alpha else "respins"

def _fmt_p(p):
    """Format a p-value for the report; a missing value is shown as n/a."""
    return "n/a" if p is None else f"{p:.3e}"

def short_explain(testname, result):
    """Build the one-line report text for a single test result.

    Args:
        testname: One of ``'entropy'``, ``'monobit'``, ``'runs'``,
            ``'block'``, ``'serial'``, ``'apen'``, ``'lz'``,
            ``'longest_run'``, ``'autocorr'``, ``'fft'``, ``'cusum'``,
            ``'compress'``.
        result: The value returned by the matching test function (a float,
            an int, a tuple, a dict, an array or a list depending on the
            test).

    Returns:
        The report line, or an empty string for an unknown ``testname``.
        A p-value that was not computed (``None``) is printed as ``n/a``.
    """
    if testname == 'entropy':
        ent = result
        if ent > 0.999:
            return f"Entropie Shannon = {ent:.6f} (foarte aproape de 1 → comportament aproape perfect aleator)"
        elif ent > 0.99:
            return f"Entropie Shannon = {ent:.6f} (înaltă, mică deviere de aleator)"
        else:
            return f"Entropie Shannon = {ent:.6f} (posibilă structură; nu e complet aleator)"

    if testname == 'monobit':
        chi2, p, ones, zeros = result
        v = verdict_from_p(p)
        return f"Monobit: ones={ones}, zeros={zeros}, chi2={chi2:.4f}, p={_fmt_p(p)} → verdict: {v} (alpha=0.01)"

    if testname == 'runs':
        # Only the precondition-failure result lacks the 'runs' key.  A test
        # that ran and was rejected also carries ok=False, so 'ok' alone
        # cannot tell the two cases apart.
        if 'runs' not in result:
            return "Runs: precondiție eșuată; proporția de 1 nu e aproape de 0.5."
        p = result.get('p', None)
        v = verdict_from_p(p) if p is not None else "n/a"
        z = result.get('z', float('nan'))
        return f"Runs: runs={result['runs']}, z={z:.3f}, p={_fmt_p(p)} → verdict: {v}"

    if testname == 'block':
        p = result.get('p', None)
        v = verdict_from_p(p)
        return f"Block-frequency: stat={result['stat']:.3f}, p={_fmt_p(p)}, blocuri={result['nb_blocks']} → verdict: {v}"

    if testname == 'serial':
        p = result.get('p', None)
        v = verdict_from_p(p)
        return f"Serial (m={result['m']}): chi2={result['chi2']:.3f}, p={_fmt_p(p)} → verdict: {v}"

    if testname == 'apen':
        return f"Approximate Entropy (m=2): {result:.6f} (valori mici sugerează predictibilitate)"

    if testname == 'lz':
        return f"Lempel-Ziv complexity (est): {result} (valori mari → complexitate mare)"

    if testname == 'longest_run':
        return f"Cel mai lung șir de 1 consec: {result} biți (comparați cu valorile așteptate NIST pentru n)"

    if testname == 'autocorr':
        ac = result
        high = float(np.max(np.abs(ac))) if len(ac) > 0 else 0.0
        return f"Autocorelație maximă (prime laguri): {high:.6f} (valori mari pot indica periodicitate)"

    if testname == 'fft':
        peaks = result
        return f"FFT: {len(peaks)} vârfuri semnificative detectate (dacă >0 → periodicități posibile)"

    if testname == 'cusum':
        p = result.get('p', None)
        v = verdict_from_p(p)
        # A p-value that was not computed is shown as n/a, not as 0.
        return f"CUSUM z={result['z']:.4f}, p~{_fmt_p(p)} (verdict: {v})"

    if testname == 'compress':
        return f"Raport compresie (zlib) = {result:.6f} (valori apropiate de 1 → dificil de comprimat → mai aleator)"

    return ""

# -------------------- main --------------------
def main():
    """Command-line entry point: run every test and write the text report.

    Parses the arguments, reads the bit file, runs the tests in a fixed
    order (each adds one line to the report), appends the summary section,
    prints the report and saves it to the output file.

    Raises:
        SystemExit: if the input file does not exist or holds no 0/1
            characters.
    """
    parser = argparse.ArgumentParser(description="Analiză avansată aleatoritate și raport .txt (română)")
    parser.add_argument("--input", "-i", default="datecriptate.txt", help="Fișier de intrare (0/1). Implicit: datecriptate.txt")
    parser.add_argument("--out", "-o", default=None, help="Fișier raport .txt (implicit: <input>_raport.txt)")
    parser.add_argument("--max-bits", type=int, default=0, help="Număr maxim de biți de analizat (0 = toți)")
    parser.add_argument("--block", type=int, default=128, help="Block size pentru block-frequency (ex: 128)")
    parser.add_argument("--m", type=int, default=4, help="Lungimea pattern-ului pentru Serial test (ex: 4)")
    parser.add_argument("--lag", type=int, default=1024, help="Max lag pentru autocorelație (ex: 1024)")
    args = parser.parse_args()

    if not os.path.exists(args.input):
        raise SystemExit(f"Eroare: fișierul nu există: {args.input}")

    # Default report name: the input path with its extension replaced.
    out_path = args.out
    if not out_path:
        out_path = os.path.splitext(args.input)[0] + "_raport.txt"
    started = time.time()

    limit = None
    if args.max_bits > 0:
        limit = args.max_bits
    bits = read_bitfile_stream(args.input, max_bits=limit)
    n = len(bits)
    if n == 0:
        raise SystemExit("Eroare: fișier gol sau conținut invalid (trebuie doar 0/1).")

    report_lines = []
    emit = report_lines.append

    limit_label = 'toate' if limit is None else limit
    emit(f"Încep analiza pe fișier: {os.path.abspath(args.input)} (max biți={limit_label})")
    emit(f"Biți citiți: {n}")

    # --- individual tests, one report line each ---
    ent = shannon_entropy(bits)
    emit(short_explain('entropy', ent))

    chi2, p_mono, ones, zeros = monobit_test(bits)
    emit(short_explain('monobit', (chi2, p_mono, ones, zeros)))

    runs = runs_test(bits)
    emit(short_explain('runs', runs))

    # Block-frequency and serial can reject their parameters; such an error
    # is reported as a line instead of aborting the run.
    try:
        bf = block_frequency(bits, block_size=args.block)
        emit(short_explain('block', bf))
    except Exception as exc:
        bf = None
        emit(f"Block-frequency: eroare {exc}")

    try:
        ser = serial_test(bits, m=args.m)
        emit(short_explain('serial', ser))
    except Exception as exc:
        ser = None
        emit(f"Serial test: eroare {exc}")

    apen = approximate_entropy(bits, m=2)
    emit(short_explain('apen', apen))

    lz = lz_complexity(bits)
    emit(short_explain('lz', lz))

    lr = longest_run_ones(bits)
    emit(short_explain('longest_run', lr))

    ac = autocorr(bits, maxlag=args.lag)
    emit(short_explain('autocorr', ac))

    peaks = fft_peaks(bits)
    emit(short_explain('fft', peaks))

    cus = cusum(bits)
    emit(short_explain('cusum', cus))

    cr = compression_ratio(bits)
    emit(short_explain('compress', cr))

    # --- summary section ---
    emit("\n=== CONCLUZII SINTETICE ===")
    if ent >= 0.995:
        emit("1) Entropia este foarte înaltă — secvența se comportă extrem de aproape de aleator real.")
    elif ent >= 0.98:
        emit("1) Entropia e înaltă — mici deviații posibile, dar în general pare aleator.")
    else:
        emit("1) Entropia e sub așteptări — există indici de structură sau corelare.")

    # Collect every test whose p-value fell below 0.01, plus any FFT peak.
    flagged = []
    if p_mono is not None and p_mono < 0.01:
        flagged.append("testul monobit")
    runs_p = runs.get('p', 1) if isinstance(runs, dict) else None
    if runs_p is not None and runs_p < 0.01:
        flagged.append("testul runurilor")
    if bf and bf.get('p', None) is not None and bf['p'] < 0.01:
        flagged.append("testul block-frequency")
    if ser and ser.get('p', None) is not None and ser['p'] < 0.01:
        flagged.append(f"testul serial (m={ser['m']})")
    if peaks:
        flagged.append("spectrul FFT sugerează periodicități")

    if flagged:
        emit("2) Atenție: următoarele teste au indicat posibile abateri: " + "; ".join(flagged))
        emit("   Recomandare: analiza pe secțiuni, verificarea canalelor de inserție, testare contra unei copii independente.")
    else:
        emit("2) Toate testele statistice folosite în subsetul avansat nu au evidențiat anomalii semnificative (alpha=0.01).")

    compress_label = 'improbabil de comprimat' if cr > 0.95 else 'posibil comprimat'
    emit(f"3) Compresibilitate (zlib) = {cr:.6f} → {compress_label}")
    emit("4) Rezultatele sunt statistice: o singură p-valoare sub prag nu dovedește cu certitudine o inserție.")
    emit(f"Timp total analiză: {time.time() - started:.2f}s")

    report = "\n".join(report_lines)
    # Print to the console first, then save to disk.
    print(report)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(report + "\n")
    print(f"\nRaport salvat în: {os.path.abspath(out_path)}")

if __name__ == "__main__":
    main()