Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions assets/js/sql-builders.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,108 @@ export function textSearchScore(terms, weightedColumns) {
).join(' + ');
}).map(score => `(${score})`).join(' + ');
}

// ---------------------------------------------------------------------------
// PID search helpers (issues #278 + #26)
// ---------------------------------------------------------------------------

// Resolver-URL prefix we strip before comparing PIDs. The pattern is anchored
// at the start of the string and matches `http://` or `https://`, one of the
// listed hosts, and the slash that follows the host; it is case-insensitive.
// The alternatives are distinct hosts, so their order does not affect matching.
// n2t.net and arks.org are ARK resolvers; doi.org handles DOIs; hdl.handle.net
// is the Handle System resolver. Only a single leading prefix can match.
// The regex has no `g`/`y` flag, so `.test()` on it carries no state between calls.
const RESOLVER_RE = /^https?:\/\/(n2t\.net|arks\.org|doi\.org|hdl\.handle\.net)\//i;

/**
 * Reduce a PID to the canonical spelling used for client-side comparison.
 *
 * Steps, in order:
 *   1. Coerce to string, trim surrounding whitespace, lowercase everything.
 *   2. Drop one leading resolver-URL prefix (see RESOLVER_RE), if present.
 *   3. Rewrite a leading classic-ARK `ark:/` as modern `ark:` (issue #26), so
 *      `ark:/28722/…` and `ark:28722/…` end up identical.
 *
 * Other scheme-prefixed identifiers (IGSN, DOI, …) are only lowercased.
 *
 * @param {*} value - PID as typed or stored; coerced with String().
 * @returns {string} The canonical, lowercased PID.
 */
export function canonicalizePid(value) {
  const CLASSIC_ARK = 'ark:/';
  const lowered = String(value).trim().toLowerCase();
  const withoutResolver = lowered.replace(RESOLVER_RE, '');
  // Only a leading `ark:/` is collapsed; anything later in the string is kept.
  if (withoutResolver.startsWith(CLASSIC_ARK)) {
    return `ark:${withoutResolver.slice(CLASSIC_ARK.length)}`;
  }
  return withoutResolver;
}

/**
 * Heuristic: decide whether a search term should be routed through PID-aware
 * matching instead of being treated as plain text only.
 *
 * The term is trimmed and lowercased, then accepted when it starts with one of:
 *   - `pid:`  explicit escape hatch (scheme-agnostic substring match on pid)
 *   - `ark:`  covers both classic (`ark:/`) and modern (`ark:`) spellings
 *   - `igsn:` SESAR-style identifiers
 *   - `doi:`  DOI scheme
 *   - `10.`   bare DOI such as 10.5281/zenodo.123
 * or when it begins with a resolver URL recognised by RESOLVER_RE.
 *
 * @param {*} term - A single search term; coerced with String().
 * @returns {boolean} true when the term carries a PID scheme or resolver URL.
 */
export function looksLikePid(term) {
  const candidate = String(term).trim().toLowerCase();
  const pidPrefixes = ['pid:', 'ark:', 'igsn:', 'doi:', '10.'];
  if (pidPrefixes.some((prefix) => candidate.startsWith(prefix))) {
    return true;
  }
  return RESOLVER_RE.test(candidate);
}

/**
 * Build a SQL predicate fragment that matches a search term against the
 * `pid` column.
 *
 * Two code paths:
 *
 * 1. `pid:` prefix (scheme-agnostic escape hatch), e.g. `pid:IEGIL000C` or
 *    `pid:k2000027w`, for finding a sample by a bare fragment without knowing
 *    its scheme. Emits a single substring match:
 *        pid ILIKE '%<fragment>%' ESCAPE '\'
 *    The fragment is not lowercased (ILIKE is case-insensitive) and no
 *    exact-match arm is emitted.
 *
 * 2. Scheme-bearing / resolver-URL terms (ark:, igsn:, doi:, 10., https://…).
 *    The term is canonicalised with canonicalizePid and two arms are OR-ed:
 *      A. Exact match: LOWER(REPLACE(pid, 'ark:/', 'ark:')) = '<canonical>'
 *      B. Local-part fallback: pid ILIKE '%<localpart>%' ESCAPE '\', where
 *         the local part is whatever follows the last '/' of the canonical
 *         form (or the whole canonical form when it contains no '/').
 *
 * Empty-input guards: an empty ILIKE fragment would produce `'%%'`, which
 * matches every row. So a bare `pid:` or a term that canonicalises to the
 * empty string (e.g. just `https://doi.org/`) yields the literal `FALSE`,
 * and a term ending in '/' (empty local part) keeps only the exact arm.
 *
 * All user input is passed through escSql / escapeIlikePattern before being
 * placed in the SQL text.
 *
 * @param {*} rawTerm - One search term as typed; coerced with String().
 * @returns {string} A boolean SQL expression, safe to OR/AND into a WHERE.
 */
export function pidSearchWhere(rawTerm) {
  const trimmed = String(rawTerm).trim();

  // --- Path 1: pid: escape hatch ---
  if (trimmed.toLowerCase().startsWith('pid:')) {
    const fragment = trimmed.slice('pid:'.length); // prefix stripped in any case
    if (fragment === '') {
      // Nothing typed after "pid:" yet — '%%' would match the whole table.
      return 'FALSE';
    }
    // Single substring ILIKE — no scheme assumption.
    return `pid ILIKE '%${escapeIlikePattern(fragment)}%' ESCAPE '\\'`;
  }

  // --- Path 2: scheme-bearing / resolver-URL term ---
  const canonical = canonicalizePid(trimmed);
  if (canonical === '') {
    // e.g. a resolver URL with nothing after the host: no identifier to match.
    return 'FALSE';
  }

  // Stored-side normalisation in SQL. REPLACE is applied to the raw stored
  // value and LOWER to its result, so only a stored lowercase `ark:/` is
  // collapsed to `ark:`.
  // NOTE(review): a stored value spelled `ARK:/…` would miss this exact arm
  // and rely on the substring fallback below; swapping to
  // REPLACE(LOWER(pid), …) would cover it but changes the emitted SQL text
  // that the unit tests pin — confirm stored casing before changing.
  const storedNorm = `LOWER(REPLACE(pid, 'ark:/', 'ark:'))`;
  const exactArm = `${storedNorm} = '${escSql(canonical)}'`;

  // Local part: everything after the last '/' of the canonical form
  // (lastIndexOf yields -1 when there is no '/', so slice(0) keeps it whole).
  const localPart = canonical.slice(canonical.lastIndexOf('/') + 1);
  if (localPart === '') {
    // Term ends in '/': skip the fallback rather than emit a match-all '%%'.
    return `(${exactArm})`;
  }

  // Exact normalised match OR substring match on the bare local identifier.
  // The substring arm only looks at pid, never at label/description columns.
  return `(${exactArm} OR pid ILIKE '%${escapeIlikePattern(localPart)}%' ESCAPE '\\')`;
}
21 changes: 20 additions & 1 deletion explorer.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,9 @@ escSql = _sqlBuilders.escSql
escapeIlikePattern = _sqlBuilders.escapeIlikePattern
textSearchWhere = _sqlBuilders.textSearchWhere
textSearchScore = _sqlBuilders.textSearchScore
canonicalizePid = _sqlBuilders.canonicalizePid
looksLikePid = _sqlBuilders.looksLikePid
pidSearchWhere = _sqlBuilders.pidSearchWhere

_explorerUtils = await import(new URL('assets/js/explorer-utils.js', document.baseURI).href)
escapeHtml = _explorerUtils.escapeHtml
Expand Down Expand Up @@ -5379,6 +5382,22 @@ zoomWatcher = {
{ col: 'description', weight: 1 },
{ col: 'CAST(place_name AS VARCHAR)', weight: 2 },
]);

// #278/#26 — PID-aware search: when any term looks like a persistent
// identifier (ARK, IGSN, DOI, resolver URL), OR a normalised PID
// predicate into the WHERE so the sample is found even though its pid
// doesn't appear in label/description/place_name. Plain-text terms
// (e.g. "pottery") are unaffected — no pid column is touched for them,
// preserving the existing hot-path performance.
const pidPredicates = terms
.filter(t => looksLikePid(t))
.map(t => pidSearchWhere(t));
// Combine: original text-field match OR any pid predicate.
// If no terms look like PIDs, fullWhere equals searchWhere exactly.
const fullWhere = pidPredicates.length
? `(${searchWhere} OR ${pidPredicates.join(' OR ')})`
: searchWhere;

// pid is unique in facets_url (verified in A1 scoping), so the build
// is naturally one row per pid — no DISTINCT needed. (The previous
// `SELECT DISTINCT pid` deduped a single column; a 5-column DISTINCT
Expand All @@ -5397,7 +5416,7 @@ zoomWatcher = {
CREATE OR REPLACE TABLE ${staging} AS
SELECT pid, label, source, place_name, (${score}) AS relevance_score
FROM read_parquet('${facets_url}')
WHERE pid IS NOT NULL AND ${searchWhere}
WHERE pid IS NOT NULL AND ${fullWhere}
`);
if (token !== _searchFilterToken) return false; // superseded mid-build
await db.query(`CREATE OR REPLACE TABLE search_pids AS
Expand Down
216 changes: 216 additions & 0 deletions tests/unit/sql-builders-pid.test.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
// Unit tests for PID-search helpers added to assets/js/sql-builders.js
// (issues #278 search-by-PID, #26 modern-vs-classic ARK).
// Run: node --test tests/unit/ (Node built-ins only, no install)
import { test } from 'node:test';
import assert from 'node:assert/strict';
import {
canonicalizePid, looksLikePid, pidSearchWhere,
} from '../../assets/js/sql-builders.js';

// ---------------------------------------------------------------------------
// canonicalizePid
// ---------------------------------------------------------------------------

// canonicalizePid cases, one entry per test: [test name, [[input, expected], …]].
// Every pair of an entry is asserted, in order, inside that entry's single test.
const canonicalizeCases = [
  ['canonicalizePid: classic ARK (ark:/) → modern form (ark:)', [
    ['ark:/28722/k2000027w', 'ark:28722/k2000027w'],
  ]],
  ['canonicalizePid: modern ARK already canonical — no change', [
    ['ark:28722/k2000027w', 'ark:28722/k2000027w'],
  ]],
  ['canonicalizePid: resolver URL stripped then ARK collapsed', [
    ['https://n2t.net/ark:/28722/k2000027w', 'ark:28722/k2000027w'],
    ['http://n2t.net/ark:28722/k2000027w', 'ark:28722/k2000027w'],
    // arks.org resolver
    ['https://arks.org/ark:/28722/k2000027w', 'ark:28722/k2000027w'],
  ]],
  ['canonicalizePid: IGSN with prefix — lowercased, otherwise unchanged', [
    ['IGSN:IEGIL000C', 'igsn:iegil000c'],
  ]],
  ['canonicalizePid: IGSN lowercase already', [
    ['igsn:iegil000c', 'igsn:iegil000c'],
  ]],
  // No scheme prefix: the value is only lowercased; fallback matching is the
  // caller's job.
  ['canonicalizePid: bare local identifier passed through lowercased', [
    ['IEGIL000C', 'iegil000c'],
  ]],
  ['canonicalizePid: DOI resolver URL stripped', [
    ['https://doi.org/10.5281/zenodo.123', '10.5281/zenodo.123'],
  ]],
  ['canonicalizePid: handle.net resolver URL stripped', [
    ['https://hdl.handle.net/20.500.12535/abc', '20.500.12535/abc'],
  ]],
  ['canonicalizePid: trims surrounding whitespace', [
    [' IGSN:IEGIL000C ', 'igsn:iegil000c'],
  ]],
];

for (const [name, pairs] of canonicalizeCases) {
  test(name, () => {
    for (const [input, expected] of pairs) {
      assert.equal(canonicalizePid(input), expected);
    }
  });
}

// ---------------------------------------------------------------------------
// looksLikePid
// ---------------------------------------------------------------------------

// looksLikePid cases, one entry per test: [test name, [terms…], expected].
// Every term of an entry is asserted, in order, inside that entry's single test.
const looksLikePidCases = [
  ['looksLikePid: classic ARK', ['ark:/28722/k2000027w'], true],
  ['looksLikePid: modern ARK', ['ark:28722/k2000027w'], true],
  ['looksLikePid: IGSN with prefix', ['IGSN:IEGIL000C', 'igsn:iegil000c'], true],
  ['looksLikePid: DOI scheme', ['doi:10.5281/zenodo.123'], true],
  ['looksLikePid: bare DOI (starts with 10.)', ['10.5281/zenodo.123'], true],
  ['looksLikePid: resolver URL', [
    'https://n2t.net/ark:/28722/k2000027w',
    'https://doi.org/10.5281/zenodo.123',
  ], true],
  ['looksLikePid: plain text is NOT a PID', [
    'pottery',
    'archaeological site',
    'basalt',
  ], false],
  // A bare local part such as IEGIL000C carries no scheme, so the heuristic
  // deliberately stays off: bare words may be meaningful text and should not
  // be routed via pid-search. The user has to add a scheme ("igsn:") or the
  // explicit "pid:" prefix for PID matching to apply.
  ['looksLikePid: bare local identifier without scheme is NOT detected', [
    'IEGIL000C',
  ], false],
];

for (const [name, terms, expected] of looksLikePidCases) {
  test(name, () => {
    for (const term of terms) {
      assert.equal(looksLikePid(term), expected);
    }
  });
}

// ---------------------------------------------------------------------------
// pidSearchWhere — SQL fragment shape and injection safety
// ---------------------------------------------------------------------------

test('pidSearchWhere: classic ARK produces normalised equality + localpart ILIKE', () => {
  const fragment = pidSearchWhere('ark:/28722/k2000027w');
  // [substring that must be present, assertion message]
  const expectedPieces = [
    // stored-side normalisation expression
    ["LOWER(REPLACE(pid, 'ark:/', 'ark:'))", 'stored-side LOWER+REPLACE present'],
    // equality compares against the canonical form (ark: rather than ark:/)
    ["= 'ark:28722/k2000027w'", 'canonical ARK in equality'],
    // fallback on the part after the last '/'
    ["pid ILIKE '%k2000027w%'", 'localpart ILIKE fallback'],
  ];
  for (const [piece, why] of expectedPieces) {
    assert.ok(fragment.includes(piece), why);
  }
  // The whole predicate is wrapped in one pair of parentheses.
  assert.ok(fragment.startsWith('(') && fragment.endsWith(')'), 'outer parens');
});

test('pidSearchWhere: IGSN with prefix', () => {
  const fragment = pidSearchWhere('IGSN:IEGIL000C');
  const hasEquality = fragment.includes("= 'igsn:iegil000c'");
  assert.ok(hasEquality, 'canonical IGSN equality');
  // The IGSN value has no '/', so the local part is the full canonical form
  // and the fallback ILIKE searches for that whole string.
  const hasFallback = fragment.includes("pid ILIKE '%igsn:iegil000c%'");
  assert.ok(hasFallback, 'IGSN full-canonical ILIKE fallback');
});

test('pidSearchWhere: resolver URL (n2t.net) — prefix stripped in canonical', () => {
  const fragment = pidSearchWhere('https://n2t.net/ark:/28722/k2000027w');
  const expectedPieces = [
    ["= 'ark:28722/k2000027w'", 'resolver prefix stripped in canonical'],
    ["pid ILIKE '%k2000027w%'", 'localpart fallback'],
  ];
  for (const [piece, why] of expectedPieces) {
    assert.ok(fragment.includes(piece), why);
  }
});

test('pidSearchWhere: injection-safe — single quotes escaped', () => {
  // A value containing a single quote must not terminate the SQL string.
  // canonicalizePid lowercases first, so "O'Malley" → "o'malley" → "o''malley".
  const sql = pidSearchWhere("ark:/99999/O'Malley");
  // Once every doubled quote is removed, no lone quote may remain inside the
  // value. (Checking for the substring 'o'malley' with surrounding quotes can
  // never fail: neither the escaped nor an unescaped output contains it.)
  assert.ok(
    !sql.replaceAll("''", '').includes("o'malley"),
    'raw unescaped single quote not present in canonical'
  );
  assert.ok(sql.includes("o''malley"), 'single quote properly doubled (after lowercasing)');
  // The equality arm carries the fully escaped canonical form.
  assert.ok(sql.includes("= 'ark:99999/o''malley'"), 'escaped canonical in equality');
});

test('pidSearchWhere: injection-safe — LIKE metacharacters escaped in localpart', () => {
  const sql = pidSearchWhere('igsn:test_50%boom');
  // _ and % must be backslash-escaped in the ILIKE pattern
  assert.ok(sql.includes('\\_'), 'underscore escaped');
  assert.ok(sql.includes('\\%'), 'percent escaped');
});

// ---------------------------------------------------------------------------
// pid: escape hatch (scheme-agnostic substring search)
// ---------------------------------------------------------------------------

test('looksLikePid: pid: prefix is recognised', () => {
  // The last two entries exercise the case-insensitive prefix check.
  const prefixedTerms = ['pid:IEGIL000C', 'pid:k2000027w', 'PID:foo', 'Pid:foo'];
  for (const term of prefixedTerms) {
    assert.equal(looksLikePid(term), true);
  }
});

test('pidSearchWhere: pid: prefix — emits bare ILIKE, no canonical exact-match arm', () => {
  // Verified against production: pid:IEGIL000C → 1 row (IGSN:IEGIL000C)
  const predicate = pidSearchWhere('pid:IEGIL000C');
  // The fragment keeps its original case; ILIKE is case-insensitive, so the
  // match does not depend on how the pid is stored.
  const expected = "pid ILIKE '%IEGIL000C%' ESCAPE '\\'";
  assert.equal(predicate, expected);
  // The LOWER(REPLACE(...)) exact-match arm must be absent on this path.
  const hasCanonicalArm = predicate.includes('LOWER(REPLACE');
  assert.ok(!hasCanonicalArm, 'no canonical arm for pid: terms');
});

test('pidSearchWhere: pid: prefix — localpart without scheme (k2000027w)', () => {
  // Verified against production: pid:k2000027w → 1 row (ark:/28722/k2000027w)
  const expected = "pid ILIKE '%k2000027w%' ESCAPE '\\'";
  assert.equal(pidSearchWhere('pid:k2000027w'), expected);
});

test('pidSearchWhere: pid: prefix — case-insensitive prefix strip (PID:)', () => {
// The "pid:" prefix is stripped regardless of its case. The remainder is NOT
// lowercased on this path — it is only passed through escapeIlikePattern —
// and ILIKE is case-insensitive, so the fragment matches stored values of any case.
const sql = pidSearchWhere('PID:IEGIL000C');
// Fragment is kept as-is through escaping (ILIKE handles case).
assert.equal(sql, "pid ILIKE '%IEGIL000C%' ESCAPE '\\'");
});

test('pidSearchWhere: pid: prefix — injection-safe (single quote in fragment)', () => {
  const sql = pidSearchWhere("pid:O'Brien");
  // Single quote must be doubled. After removing every doubled quote, no lone
  // quote may remain inside the fragment. (Checking for 'O'Brien' with
  // surrounding quotes can never fail: the pattern is wrapped in % signs, so
  // that substring is absent from escaped and unescaped output alike.)
  assert.ok(
    !sql.replaceAll("''", '').includes("O'Brien"),
    'raw unescaped single quote absent'
  );
  assert.ok(sql.includes("O''Brien"), 'single quote doubled');
});

test('pidSearchWhere: pid: prefix — injection-safe (LIKE metacharacters in fragment)', () => {
  const sql = pidSearchWhere('pid:test_50%boom');
  // _ and % must be backslash-escaped so they match literally in ILIKE.
  assert.ok(sql.includes('\\_'), 'underscore escaped');
  assert.ok(sql.includes('\\%'), 'percent escaped');
});

test('Option A: bare local identifiers (no scheme) are NOT routed via looksLikePid', () => {
  // Default behaviour stays as it was: PID matching applies only to terms with
  // a scheme or the pid: prefix, while plain words go through text search.
  const bareTerms = ['IEGIL000C', 'vdm_19600211', 'pottery'];
  for (const term of bareTerms) {
    assert.equal(looksLikePid(term), false);
  }
});
Loading