#!/usr/bin/env node
/**
 * Build ovilus5_bank.txt: large contemporary English pool for NULLFIELD Ovilus-style mode.
 *
 * - Up to **30,000** frequency-ranked **single-word** types (dictionary-gated), in this order:
 *   1. `english_10k.txt` — Google **USA** 10k (no swears), frequency order (American register first).
 *   2. `norvig_count_1w.txt` — Peter Norvig / Google Books **1-gram** counts (~333k types, frequency order);
 *      we walk from the top, skipping already-seen words, until the **combined** frequency slice hits 30k.
 * - Then **all** given names, US places, and spoken phrases (no per-list cap — “include everything” from these lists;
 *   the combined output is still truncated to `OUTPUT_CAP` lines when written).
 *
 * Multi-word lines are only from `us_places_common.txt` and `spoken_phrases_us.txt` (curated).
 * Single words from frequency lists must appear in `words_alpha.txt` (a lone digit 0–9 is exempt from the dictionary check).
 *
 * Run: node nullfield/data/build_ovilus_bank.mjs
 *
 * Data refresh:
 *   curl -fsSL 'https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-usa-no-swears.txt' \
 *     -o nullfield/data/english_10k.txt
 *   curl -fsSL 'https://norvig.com/ngrams/count_1w.txt' \
 *     -o nullfield/data/norvig_count_1w.txt
 */
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/** Build the path of a data file that sits next to this script. */
const dataFile = (name) => path.join(__dirname, name);

// Dictionary gate for frequency-list words (the script exits if it is missing).
const WORDS_ALPHA = dataFile('words_alpha.txt');
// Frequency-ranked inputs: the 10k list is skipped when absent, the Norvig list is mandatory.
const ENGLISH_USA_10K = dataFile('english_10k.txt');
const NORVIG_1W = dataFile('norvig_count_1w.txt');
// Extra lists appended after the frequency slice; each one is skipped when absent.
const NAMES = dataFile('names_common.txt');
const PLACES = dataFile('us_places_common.txt');
const PHRASES = dataFile('spoken_phrases_us.txt');
// Generated word bank.
const OUT = dataFile('ovilus5_bank.txt');

/** Target size for the frequency-ranked single-word slice (before names / places / phrases). */
const TOP_FREQ_SINGLE = 30000;
/** Hard safety cap for session pool in the app (see OVILUS5_STYLE_BANK in index.html). */
const OUTPUT_CAP = 50000;
/** Longest normalized line, in characters, that is accepted into the bank. */
const MAX_TOKEN_LEN = 72;

/**
 * Read a UTF-8 text file and split it into lines.
 *
 * Both LF and CRLF line endings are recognized. A file that ends with a
 * newline yields a final empty string in the result.
 *
 * @param {string} p - Path of the file to read.
 * @returns {string[]} The file's lines, without their terminators.
 */
function loadLines(p) {
  const text = fs.readFileSync(p, { encoding: 'utf8' });
  const lineBreak = /\r?\n/;
  return text.split(lineBreak);
}

/**
 * Load the dictionary file into a set of lower-cased words.
 *
 * Each line of WORDS_ALPHA is trimmed and lower-cased; lines that are empty
 * after trimming are ignored.
 *
 * @returns {Set<string>} Every non-empty dictionary entry.
 */
function loadDictSet() {
  const entries = loadLines(WORDS_ALPHA)
    .map((entry) => entry.trim().toLowerCase())
    .filter((entry) => entry !== '');
  return new Set(entries);
}

/**
 * Normalize a raw input line for comparison and storage.
 *
 * Falsy input becomes the empty string. Otherwise the value is stringified,
 * stripped of surrounding whitespace, lower-cased, and every internal run of
 * whitespace is collapsed to a single space.
 *
 * @param {*} s - Raw line (usually a string).
 * @returns {string} The normalized line.
 */
function normLine(s) {
  const text = s ? String(s) : '';
  const lowered = text.trim().toLowerCase();
  return lowered.replace(/\s+/g, ' ');
}

/**
 * Decide whether a line may be stored in the bank.
 *
 * The line is normalized first. It is accepted when it is non-empty, no longer
 * than MAX_TOKEN_LEN characters, and made only of lower-case letter/digit
 * groups separated by single spaces.
 *
 * @param {*} s - Candidate line.
 * @returns {boolean} True when the normalized line is acceptable.
 */
function isValidBankLine(s) {
  const candidate = normLine(s);
  const tooLong = candidate.length > MAX_TOKEN_LEN;
  if (candidate === '' || tooLong) {
    return false;
  }
  const shape = /^(?:[a-z0-9]+(?: [a-z0-9]+)*)$/;
  return shape.test(candidate);
}

/**
 * Dictionary gate for a single frequency-list word.
 *
 * A lone digit always passes. Anything else must consist solely of lower-case
 * ASCII letters and be present in the dictionary.
 *
 * @param {string} w - Word to check (expected to be normalized already).
 * @param {Set<string>} dict - Dictionary of allowed words.
 * @returns {boolean} True when the word is allowed.
 */
function singleWordDictOk(w, dict) {
  const isLoneDigit = /^\d$/.test(w);
  if (isLoneDigit) {
    return true;
  }
  const isAllLetters = /^[a-z]+$/.test(w);
  return isAllLetters && dict.has(w);
}

/**
 * Extract the word from one line of the Norvig 1-gram count file.
 *
 * The word is whatever precedes the first tab of the trimmed line (the whole
 * line when there is no tab), trimmed again and lower-cased.
 *
 * @param {string} line - One raw line of the count file.
 * @returns {string} The lower-cased word, or '' for a blank line.
 */
function parseNorvigWord(line) {
  const [firstField] = line.trim().split('\t');
  return firstField.trim().toLowerCase();
}

/**
 * Assemble the ordered, de-duplicated bank.
 *
 * Sources are consumed in a fixed order and an entry is kept only the first
 * time it is seen:
 *   1. ENGLISH_USA_10K (skipped when the file is absent), dictionary-gated.
 *   2. NORVIG_1W (mandatory), dictionary-gated, until the frequency slice
 *      holds TOP_FREQ_SINGLE words or the file is exhausted.
 *   3. NAMES, PLACES, PHRASES (each skipped when absent), with no size limit.
 *
 * Prints an error and exits the process with status 1 when NORVIG_1W is missing.
 *
 * @param {Set<string>} dict - Dictionary used to gate frequency-list words.
 * @returns {string[]} Bank lines in insertion order.
 */
function mergeUnique(dict) {
  const taken = new Set();
  const bank = [];
  let freqCount = 0;

  const freqSliceFull = () => freqCount >= TOP_FREQ_SINGLE;

  // Record a token as used and append it to the result.
  const accept = (token) => {
    taken.add(token);
    bank.push(token);
  };

  // Frequency-list word: single token, dictionary-gated, counted toward the slice.
  const pushDictWord = (raw) => {
    if (freqSliceFull()) return;
    const token = normLine(raw);
    const eligible =
      isValidBankLine(token) &&
      !taken.has(token) &&
      !token.includes(' ') &&
      singleWordDictOk(token, dict);
    if (!eligible) return;
    accept(token);
    freqCount += 1;
  };

  // Given name: one word of 2-15 lower-case letters, no dictionary check.
  const pushName = (raw) => {
    const token = normLine(raw);
    const eligible =
      token !== '' &&
      !token.includes(' ') &&
      /^[a-z]{2,15}$/.test(token) &&
      !taken.has(token);
    if (eligible) accept(token);
  };

  // Curated place / phrase: may be multi-word, only the bank-line shape is checked.
  const pushCurated = (raw) => {
    const token = normLine(raw);
    if (isValidBankLine(token) && !taken.has(token)) accept(token);
  };

  if (fs.existsSync(ENGLISH_USA_10K)) {
    for (const row of loadLines(ENGLISH_USA_10K)) {
      if (freqSliceFull()) break;
      pushDictWord(row);
    }
  }

  if (!fs.existsSync(NORVIG_1W)) {
    console.error('Missing', NORVIG_1W);
    process.exit(1);
  }

  // Walk the Norvig file one line at a time instead of splitting it up front,
  // so scanning stops as soon as the frequency slice is full.
  const norvigText = fs.readFileSync(NORVIG_1W, 'utf8');
  let start = 0;
  while (start <= norvigText.length && !freqSliceFull()) {
    const newlineAt = norvigText.indexOf('\n', start);
    const end = newlineAt === -1 ? norvigText.length : newlineAt;
    const word = parseNorvigWord(norvigText.slice(start, end));
    start = end + 1;
    const usable = word !== '' && !word.includes(' ') && /^[a-z]+$/.test(word);
    if (usable) pushDictWord(word);
  }

  const optionalSources = [
    [NAMES, pushName],
    [PLACES, pushCurated],
    [PHRASES, pushCurated],
  ];
  for (const [file, push] of optionalSources) {
    if (!fs.existsSync(file)) continue;
    for (const row of loadLines(file)) {
      push(row);
    }
  }

  return bank;
}

/**
 * Build the bank file and print a one-line JSON summary.
 *
 * Exits with status 1 when the dictionary file is missing or when fewer than
 * 2000 entries were merged. Otherwise writes at most OUTPUT_CAP entries to OUT,
 * one per line with a trailing newline, and logs the summary to stdout.
 */
function main() {
  if (!fs.existsSync(WORDS_ALPHA)) {
    console.error('Missing', WORDS_ALPHA);
    process.exit(1);
  }

  const merged = mergeUnique(loadDictSet());
  const uniqueTotal = merged.length;
  if (uniqueTotal < 2000) {
    console.error('Too few merged tokens:', uniqueTotal);
    process.exit(1);
  }

  // Truncate only when the merged list exceeds the cap.
  const wasCapped = uniqueTotal > OUTPUT_CAP;
  const lines = wasCapped ? merged.slice(0, OUTPUT_CAP) : merged;
  fs.writeFileSync(OUT, `${lines.join('\n')}\n`);

  const summary = {
    freqTarget: TOP_FREQ_SINGLE,
    uniqueTotal,
    written: lines.length,
    capped: wasCapped,
    outFile: OUT,
  };
  console.log(JSON.stringify(summary));
}

// Script entry point: the build runs as soon as this module is loaded.
main();
