{
  "schema_id": "sovrient.pdf_text_normalization_spec",
  "schema_version": "1.0.0",
  "normalization_spec_id": "pdf_text_normalized_phrase_v1",
  "description": "Normalization steps applied before phrase matching against PDF-derived text surfaces.",
  "rules": [
    {
      "order": 1,
      "operation": "replace_unicode_quotes",
      "details": {
        "’": "'",
        "‘": "'",
        "“": "\"",
        "”": "\""
      }
    },
    {
      "order": 2,
      "operation": "replace_unicode_dashes",
      "details": {
        "–": "-",
        "—": "-"
      }
    },
    {
      "order": 3,
      "operation": "expand_common_ligatures",
      "details": {
        "ﬀ": "ff",
        "ﬁ": "fi",
        "ﬂ": "fl"
      }
    },
    {
      "order": 4,
      "operation": "remove_form_feed",
      "details": "Replace each page-break form-feed character (U+000C) with a single space before phrase search."
    },
    {
      "order": 5,
      "operation": "join_linebreak_hyphenation",
      "details": "When a line ends with '-' and the next line begins with a lowercase letter, remove the hyphen and the line break so the two fragments join into a single token."
    },
    {
      "order": 6,
      "operation": "collapse_whitespace",
      "details": "Collapse each run of whitespace into a single ASCII space, then trim leading and trailing whitespace."
    }
  ]
}
