diff --git a/assets/js/sql-builders.js b/assets/js/sql-builders.js
index d4c1b50..833772c 100644
--- a/assets/js/sql-builders.js
+++ b/assets/js/sql-builders.js
@@ -39,3 +39,127 @@ export function textSearchScore(terms, weightedColumns) {
     ).join(' + ');
   }).map(score => `(${score})`).join(' + ');
 }
+
+// ---------------------------------------------------------------------------
+// PID search helpers (issues #278 + #26)
+// ---------------------------------------------------------------------------
+
+// Resolver-URL prefixes we strip before comparing.  n2t.net and arks.org are
+// ARK resolvers; doi.org handles DOIs; hdl.handle.net is the Handle System
+// resolver.  The pattern is anchored at the start of the string and every
+// alternative must be followed by '/', so the order of the alternatives does
+// not matter and at most one prefix is stripped per call.
+const RESOLVER_RE = /^https?:\/\/(n2t\.net|arks\.org|doi\.org|hdl\.handle\.net)\//i;
+
+// Bare DOI: "10." + a 4–9 digit registrant code (optionally subdivided with
+// dots) + "/" + a non-empty suffix.  Stricter than "starts with 10." so that
+// plain numeric search terms such as "10.5" are not treated as identifiers.
+const BARE_DOI_RE = /^10\.\d{4,9}(\.\d+)*\/\S/;
+
+// Canonicalize a PID for client-side comparison:
+//   1. Lowercase the whole string.
+//   2. Strip a leading resolver-URL prefix (n2t.net, doi.org, etc.).
+//   3. Collapse classic-ARK `ark:/` → `ark:` (issue #26: modern vs classic ARK).
+//      The slash after the colon is not part of the ARK NAAN — both forms
+//      (`ark:/28722/…` and `ark:28722/…`) refer to the same identifier.
+// IGSN, DOI, and other scheme-prefixed identifiers are lowercased but otherwise
+// left intact; pidSearchWhere lowercases the stored side in SQL to match.
+export function canonicalizePid(value) {
+  let v = String(value).trim().toLowerCase();
+  v = v.replace(RESOLVER_RE, '');          // strip resolver prefix if present
+  v = v.replace(/^ark:\//, 'ark:');        // collapse classic → modern ARK form
+  return v;
+}
+
+// Heuristic: does this search term look like a PID rather than plain text?
+// Returns true when the term carries an explicit PID scheme or resolver URL,
+// so we can route it through PID-aware matching without changing plain-text
+// search behaviour.
+//
+// Covered cases:
+//   pid:…       — explicit escape hatch: scheme-agnostic substring match on pid col
+//   ark:…       — both classic (ark:/) and modern (ark:) forms
+//   igsn:…      — SESAR-style identifiers
+//   doi:…       — DOI scheme
+//   10.NNNN/…   — bare DOI (see BARE_DOI_RE); a number like "10.5" is plain text
+//   https?://n2t.net/…, https?://doi.org/…, etc. — resolver URLs
+export function looksLikePid(term) {
+  const t = String(term).trim().toLowerCase();
+  return (
+    t.startsWith('pid:') ||        // explicit escape hatch (see pidSearchWhere)
+    t.startsWith('ark:') ||
+    t.startsWith('igsn:') ||
+    t.startsWith('doi:') ||
+    BARE_DOI_RE.test(t) ||         // bare DOI like 10.5281/zenodo.123
+    RESOLVER_RE.test(t)            // resolver URL
+  );
+}
+
+// Build a SQL predicate fragment for PID matching.
+//
+// Two code paths:
+//
+// 1. `pid:` prefix (scheme-agnostic escape hatch) — user typed e.g. `pid:IEGIL000C`
+//    or `pid:k2000027w` to find a sample by a bare fragment without knowing the
+//    scheme.  Emits a single ILIKE substring match against the pid column:
+//      pid ILIKE '%<fragment>%' ESCAPE '\'
+//    DuckDB's ILIKE is already case-insensitive so no LOWER is needed.  The
+//    remainder after "pid:" is passed through escapeIlikePattern for safety.
+//    The canonical exact-match arm is intentionally skipped — the substring
+//    already spans all scheme variants.  A bare `pid:` with nothing after it
+//    yields FALSE, because '%%' would match every row.
+//
+// 2. Scheme-bearing / resolver-URL terms (ark:, igsn:, doi:, 10.NNNN/…, https://…) —
+//    two-sided normalisation so stored format doesn't matter:
+//    A. Exact-match: REPLACE(LOWER(pid), 'ark:/', 'ark:') = '<canonical>'
+//       LOWER runs first so the case-sensitive REPLACE also collapses a
+//       stored `ARK:/…`.  Assumes stored pids carry no resolver-URL prefix.
+//    B. Local-part fallback: pid ILIKE '%<localpart>%' ESCAPE '\'
+//       The part after the last '/' (or the whole canonical if no '/')
+//       catches bare local identifiers that coincide with the query.
+//       Omitted when the local part is empty (trailing '/') or is only a
+//       scheme such as `ark:`, since it would then match far too much.
+//    Both predicates OR-ed.
+//
+// All user input passes through escSql / escapeIlikePattern — no raw interpolation.
+export function pidSearchWhere(rawTerm) {
+  const trimmed = String(rawTerm).trim();
+
+  // --- Path 1: pid: escape hatch ---
+  if (trimmed.toLowerCase().startsWith('pid:')) {
+    const fragment = trimmed.slice(4);  // strip the "pid:" prefix (any case)
+    // Nothing to search for: '%%' would match every non-null pid.
+    if (fragment === '') return 'FALSE';
+    const fragEsc = escapeIlikePattern(fragment);
+    // Single substring ILIKE — no scheme assumption, ILIKE is case-insensitive.
+    return `pid ILIKE '%${fragEsc}%' ESCAPE '\\'`;
+  }
+
+  // --- Path 2: scheme-bearing / resolver-URL term ---
+  const canonical = canonicalizePid(trimmed);
+  // Safe interpolation via escSql (no raw user input in the SQL string).
+  const canonEsc = escSql(canonical);
+
+  // Stored-side normalisation in SQL: lowercase + collapse ark:/ → ark:
+  // (DuckDB's REPLACE is case-sensitive on the search string, so we LOWER
+  // first, then replace the already-lowercased prefix.)
+  const storedNorm = `REPLACE(LOWER(pid), 'ark:/', 'ark:')`;
+  const exactArm = `${storedNorm} = '${canonEsc}'`;
+
+  // Local-part fallback: strip everything up to and including the last '/'
+  // in the *canonical* form, leaving the bare local identifier.
+  const slashIdx = canonical.lastIndexOf('/');
+  const localPart = slashIdx >= 0 ? canonical.slice(slashIdx + 1) : canonical;
+
+  // Without an identifier to anchor on, the substring arm degenerates into
+  // '%%' (trailing slash) or '%ark:%' (scheme only); keep the exact arm only.
+  if (localPart === '' || /^[a-z]+:$/.test(localPart)) {
+    return `(${exactArm})`;
+  }
+  const localEsc = escapeIlikePattern(localPart);
+
+  // Combine: exact normalised match OR bare-localpart substring match.
+  // The substring match is deliberately narrow (must appear somewhere in pid)
+  // so it doesn't produce false hits on label/description columns.
+  return `(${exactArm} OR pid ILIKE '%${localEsc}%' ESCAPE '\\')`;
+}
diff --git a/explorer.qmd b/explorer.qmd
index a117c9e..40a5330 100644
--- a/explorer.qmd
+++ b/explorer.qmd
@@ -858,6 +858,9 @@ escSql = _sqlBuilders.escSql
 escapeIlikePattern = _sqlBuilders.escapeIlikePattern
 textSearchWhere = _sqlBuilders.textSearchWhere
 textSearchScore = _sqlBuilders.textSearchScore
+canonicalizePid = _sqlBuilders.canonicalizePid
+looksLikePid = _sqlBuilders.looksLikePid
+pidSearchWhere = _sqlBuilders.pidSearchWhere
 
 _explorerUtils = await import(new URL('assets/js/explorer-utils.js', document.baseURI).href)
 escapeHtml = _explorerUtils.escapeHtml
@@ -5379,6 +5382,22 @@ zoomWatcher = {
             { col: 'description', weight: 1 },
             { col: 'CAST(place_name AS VARCHAR)', weight: 2 },
         ]);
+
+        // #278/#26 — PID-aware search: when any term looks like a persistent
+        // identifier (ARK, IGSN, DOI, resolver URL), OR a normalised PID
+        // predicate into the WHERE so the sample is found even though its pid
+        // doesn't appear in label/description/place_name.  Plain-text terms
+        // (e.g. "pottery") are unaffected — no pid column is touched for them,
+        // preserving the existing hot-path performance.
+        const pidPredicates = terms
+            .filter(t => looksLikePid(t))
+            .map(t => pidSearchWhere(t));
+        // Combine: original text-field match OR any pid predicate.
+        // If no terms look like PIDs, fullWhere equals searchWhere exactly.
+        const fullWhere = pidPredicates.length
+            ? `(${searchWhere} OR ${pidPredicates.join(' OR ')})`
+            : searchWhere;
+
         // pid is unique in facets_url (verified in A1 scoping), so the build
         // is naturally one row per pid — no DISTINCT needed.  (The previous
         // `SELECT DISTINCT pid` deduped a single column; a 5-column DISTINCT
@@ -5397,7 +5416,7 @@ zoomWatcher = {
             CREATE OR REPLACE TABLE ${staging} AS
             SELECT pid, label, source, place_name, (${score}) AS relevance_score
             FROM read_parquet('${facets_url}')
-            WHERE pid IS NOT NULL AND ${searchWhere}
+            WHERE pid IS NOT NULL AND ${fullWhere}
         `);
         if (token !== _searchFilterToken) return false; // superseded mid-build
         await db.query(`CREATE OR REPLACE TABLE search_pids AS
diff --git a/tests/unit/sql-builders-pid.test.mjs b/tests/unit/sql-builders-pid.test.mjs
new file mode 100644
index 0000000..fa8be1f
--- /dev/null
+++ b/tests/unit/sql-builders-pid.test.mjs
@@ -0,0 +1,240 @@
+// Unit tests for PID-search helpers added to assets/js/sql-builders.js
+// (issues #278 search-by-PID, #26 modern-vs-classic ARK).
+// Run: node --test tests/unit/   (Node built-ins only, no install)
+import { test } from 'node:test';
+import assert from 'node:assert/strict';
+import {
+  canonicalizePid, looksLikePid, pidSearchWhere,
+} from '../../assets/js/sql-builders.js';
+
+// ---------------------------------------------------------------------------
+// canonicalizePid
+// ---------------------------------------------------------------------------
+
+test('canonicalizePid: classic ARK (ark:/) → modern form (ark:)', () => {
+  assert.equal(canonicalizePid('ark:/28722/k2000027w'), 'ark:28722/k2000027w');
+});
+
+test('canonicalizePid: modern ARK already canonical — no change', () => {
+  assert.equal(canonicalizePid('ark:28722/k2000027w'), 'ark:28722/k2000027w');
+});
+
+test('canonicalizePid: resolver URL stripped then ARK collapsed', () => {
+  assert.equal(
+    canonicalizePid('https://n2t.net/ark:/28722/k2000027w'),
+    'ark:28722/k2000027w'
+  );
+  assert.equal(
+    canonicalizePid('http://n2t.net/ark:28722/k2000027w'),
+    'ark:28722/k2000027w'
+  );
+  // arks.org resolver
+  assert.equal(
+    canonicalizePid('https://arks.org/ark:/28722/k2000027w'),
+    'ark:28722/k2000027w'
+  );
+});
+
+test('canonicalizePid: IGSN with prefix — lowercased, otherwise unchanged', () => {
+  assert.equal(canonicalizePid('IGSN:IEGIL000C'), 'igsn:iegil000c');
+});
+
+test('canonicalizePid: IGSN lowercase already', () => {
+  assert.equal(canonicalizePid('igsn:iegil000c'), 'igsn:iegil000c');
+});
+
+test('canonicalizePid: bare local identifier passed through lowercased', () => {
+  // No scheme prefix — just lowercase; caller handles fallback matching.
+  assert.equal(canonicalizePid('IEGIL000C'), 'iegil000c');
+});
+
+test('canonicalizePid: DOI resolver URL stripped', () => {
+  assert.equal(
+    canonicalizePid('https://doi.org/10.5281/zenodo.123'),
+    '10.5281/zenodo.123'
+  );
+});
+
+test('canonicalizePid: handle.net resolver URL stripped', () => {
+  assert.equal(
+    canonicalizePid('https://hdl.handle.net/20.500.12535/abc'),
+    '20.500.12535/abc'
+  );
+});
+
+test('canonicalizePid: trims surrounding whitespace', () => {
+  assert.equal(canonicalizePid('  IGSN:IEGIL000C  '), 'igsn:iegil000c');
+});
+
+// ---------------------------------------------------------------------------
+// looksLikePid
+// ---------------------------------------------------------------------------
+
+test('looksLikePid: classic ARK', () => {
+  assert.equal(looksLikePid('ark:/28722/k2000027w'), true);
+});
+
+test('looksLikePid: modern ARK', () => {
+  assert.equal(looksLikePid('ark:28722/k2000027w'), true);
+});
+
+test('looksLikePid: IGSN with prefix', () => {
+  assert.equal(looksLikePid('IGSN:IEGIL000C'), true);
+  assert.equal(looksLikePid('igsn:iegil000c'), true);
+});
+
+test('looksLikePid: DOI scheme', () => {
+  assert.equal(looksLikePid('doi:10.5281/zenodo.123'), true);
+});
+
+test('looksLikePid: bare DOI (starts with 10.)', () => {
+  assert.equal(looksLikePid('10.5281/zenodo.123'), true);
+});
+
+test('looksLikePid: number that merely starts with "10." is NOT a bare DOI', () => {
+  // A DOI needs a registrant prefix and a "/suffix"; "10.5" is just a number.
+  assert.equal(looksLikePid('10.5'), false);
+  assert.equal(looksLikePid('10.5281'), false);
+});
+
+test('looksLikePid: resolver URL', () => {
+  assert.equal(looksLikePid('https://n2t.net/ark:/28722/k2000027w'), true);
+  assert.equal(looksLikePid('https://doi.org/10.5281/zenodo.123'), true);
+});
+
+test('looksLikePid: plain text is NOT a PID', () => {
+  assert.equal(looksLikePid('pottery'), false);
+  assert.equal(looksLikePid('archaeological site'), false);
+  assert.equal(looksLikePid('basalt'), false);
+});
+
+test('looksLikePid: bare local identifier without scheme is NOT detected', () => {
+  // A bare local part like IEGIL000C has no scheme — the user must include
+  // "igsn:" for the heuristic to fire.  This is intentional: bare words
+  // might be meaningful text and we don't want to route them via pid-search.
+  assert.equal(looksLikePid('IEGIL000C'), false);
+  // To search by a bare fragment the user prefixes it with the "pid:" escape
+  // hatch (pid:IEGIL000C), which is covered by the pid: tests further down.
+});
+
+// ---------------------------------------------------------------------------
+// pidSearchWhere — SQL fragment shape and injection safety
+// ---------------------------------------------------------------------------
+
+test('pidSearchWhere: classic ARK produces normalised equality + localpart ILIKE', () => {
+  const sql = pidSearchWhere('ark:/28722/k2000027w');
+  // Stored-side normalisation (LOWER first, then the case-sensitive REPLACE):
+  assert.ok(sql.includes("REPLACE(LOWER(pid), 'ark:/', 'ark:')"), 'stored-side LOWER-then-REPLACE present');
+  // Canonical form (ark: not ark:/) in the equality comparison:
+  assert.ok(sql.includes("= 'ark:28722/k2000027w'"), 'canonical ARK in equality');
+  // Localpart fallback (the part after the last '/'):
+  assert.ok(sql.includes("pid ILIKE '%k2000027w%'"), 'localpart ILIKE fallback');
+  // Both sides wrapped in outer parens:
+  assert.ok(sql.startsWith('(') && sql.endsWith(')'), 'outer parens');
+});
+
+test('pidSearchWhere: IGSN with prefix', () => {
+  const sql = pidSearchWhere('IGSN:IEGIL000C');
+  assert.ok(sql.includes("= 'igsn:iegil000c'"), 'canonical IGSN equality');
+  // No '/' in the IGSN value, so localpart equals the full canonical form.
+  // The fallback ILIKE therefore matches the whole canonical pid substring.
+  assert.ok(sql.includes("pid ILIKE '%igsn:iegil000c%'"), 'IGSN full-canonical ILIKE fallback');
+});
+
+test('pidSearchWhere: resolver URL (n2t.net) — prefix stripped in canonical', () => {
+  const sql = pidSearchWhere('https://n2t.net/ark:/28722/k2000027w');
+  assert.ok(sql.includes("= 'ark:28722/k2000027w'"), 'resolver prefix stripped in canonical');
+  assert.ok(sql.includes("pid ILIKE '%k2000027w%'"), 'localpart fallback');
+});
+
+test('pidSearchWhere: injection-safe — single quotes escaped', () => {
+  // A malicious value with a single quote must not break the SQL string.
+  // canonicalizePid lowercases first, so "O'Malley" → "o'malley" → "o''malley".
+  const sql = pidSearchWhere("ark:/99999/O'Malley");
+  assert.ok(!sql.includes("'o'malley'"), 'raw unescaped single quote not present in canonical');
+  assert.ok(sql.includes("o''malley"), 'single quote properly doubled (after lowercasing)');
+});
+
+test('pidSearchWhere: injection-safe — LIKE metacharacters escaped in localpart', () => {
+  const sql = pidSearchWhere('igsn:test_50%boom');
+  // _ and % must be backslash-escaped in the ILIKE pattern
+  assert.ok(sql.includes('\\_'), 'underscore escaped');
+  assert.ok(sql.includes('\\%'), 'percent escaped');
+});
+
+test('pidSearchWhere: trailing slash — no match-everything ILIKE arm', () => {
+  // The local part after the final '/' is empty; '%%' would match every row.
+  const sql = pidSearchWhere('ark:/28722/');
+  assert.ok(sql.includes("= 'ark:28722/'"), 'exact arm kept');
+  assert.ok(!sql.includes('ILIKE'), 'empty localpart emits no ILIKE arm');
+});
+
+test('pidSearchWhere: scheme-only term — no scheme-wide ILIKE arm', () => {
+  const sql = pidSearchWhere('ark:');
+  assert.ok(sql.includes("= 'ark:'"), 'exact arm kept');
+  assert.ok(!sql.includes('ILIKE'), 'scheme-only localpart emits no ILIKE arm');
+});
+
+// ---------------------------------------------------------------------------
+// pid: escape hatch (scheme-agnostic substring search)
+// ---------------------------------------------------------------------------
+
+test('looksLikePid: pid: prefix is recognised', () => {
+  assert.equal(looksLikePid('pid:IEGIL000C'), true);
+  assert.equal(looksLikePid('pid:k2000027w'), true);
+  // case-insensitive prefix
+  assert.equal(looksLikePid('PID:foo'), true);
+  assert.equal(looksLikePid('Pid:foo'), true);
+});
+
+test('pidSearchWhere: pid: prefix — emits bare ILIKE, no canonical exact-match arm', () => {
+  // Verified against production: pid:IEGIL000C → 1 row (IGSN:IEGIL000C)
+  const sql = pidSearchWhere('pid:IEGIL000C');
+  // Fragment is preserved as-is (no lowercasing) — ILIKE is case-insensitive
+  // so the match still works regardless of stored case.
+  assert.equal(sql, "pid ILIKE '%IEGIL000C%' ESCAPE '\\'");
+  // No REPLACE(LOWER(...)) canonical arm present.
+  assert.ok(!sql.includes('REPLACE(LOWER'), 'no canonical arm for pid: terms');
+});
+
+test('pidSearchWhere: pid: prefix — localpart without scheme (k2000027w)', () => {
+  // Verified against production: pid:k2000027w → 1 row (ark:/28722/k2000027w)
+  const sql = pidSearchWhere('pid:k2000027w');
+  assert.equal(sql, "pid ILIKE '%k2000027w%' ESCAPE '\\'");
+});
+
+test('pidSearchWhere: pid: prefix — case-insensitive prefix strip (PID:)', () => {
+  // The "pid:" prefix is stripped regardless of case.  The fragment itself is
+  // NOT lowercased on this path; it only goes through escapeIlikePattern, and
+  // ILIKE's case-insensitivity makes the match work regardless of stored case.
+  const sql = pidSearchWhere('PID:IEGIL000C');
+  // Fragment is kept as-is through escaping (ILIKE handles case).
+  assert.equal(sql, "pid ILIKE '%IEGIL000C%' ESCAPE '\\'");
+});
+
+test('pidSearchWhere: pid: prefix — empty fragment matches nothing', () => {
+  // "pid:" alone must not expand to ILIKE '%%' (which matches every row).
+  assert.equal(pidSearchWhere('pid:'), 'FALSE');
+  assert.equal(pidSearchWhere('  PID:  '), 'FALSE');
+});
+
+test('pidSearchWhere: pid: prefix — injection-safe (single quote in fragment)', () => {
+  const sql = pidSearchWhere("pid:O'Brien");
+  // Single quote must be doubled; no raw quote in output.
+  assert.ok(!sql.includes("'O'Brien'"), 'raw unescaped single quote absent');
+  assert.ok(sql.includes("O''Brien"), 'single quote doubled');
+});
+
+test('pidSearchWhere: pid: prefix — injection-safe (LIKE metacharacters in fragment)', () => {
+  const sql = pidSearchWhere('pid:test_50%boom');
+  assert.ok(sql.includes('\\_'), 'underscore escaped');
+  assert.ok(sql.includes('\\%'), 'percent escaped');
+});
+
+test('Option A: bare local identifiers (no scheme) are NOT routed via looksLikePid', () => {
+  // This confirms the default behaviour is unchanged — only scheme-prefixed or
+  // pid:-prefixed terms get PID matching.  Plain words go through text search.
+  assert.equal(looksLikePid('IEGIL000C'), false);
+  assert.equal(looksLikePid('vdm_19600211'), false);
+  assert.equal(looksLikePid('pottery'), false);
+});