# Source code for meipi.indexing.search

"""PostgreSQL full-text search for indexed documents."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Literal

import sqlalchemy as sa
from sqlalchemy import func, or_, select
from sqlalchemy.orm import Session

from .model import DBMeta

QueryMode = Literal["plain", "websearch", "phrase"]


[Doku] @dataclass(frozen=True, slots=True) class DocSearchHit: """One filemeta row matching a full-text query.""" meta_id: int path: str fname: str suffix: str rank: float snippet: str
def _tsquery(lang: str, query: str, mode: QueryMode):
    """Build the ``tsquery`` SQL expression for *query* according to *mode*.

    ``"plain"`` maps to ``plainto_tsquery`` and ``"phrase"`` to
    ``phraseto_tsquery``; any other value (including ``"websearch"``) falls
    through to ``websearch_to_tsquery``.

    Args:
        lang: Text-search configuration name passed as the first argument.
        query: Raw query text supplied by the caller.
        mode: Which PostgreSQL query parser to use.

    Returns:
        A SQLAlchemy function expression producing a ``tsquery``.
    """
    if mode == "plain":
        return func.plainto_tsquery(lang, query)
    if mode == "phrase":
        # The PostgreSQL function is spelled ``phraseto_tsquery``.  ``func``
        # emits whatever attribute name it is given, so a misspelling only
        # surfaces as an undefined-function error when the query executes.
        return func.phraseto_tsquery(lang, query)
    return func.websearch_to_tsquery(lang, query)


def _metadata_text():
    """Plain-text bundle of structural fields and Tika ``meta_data`` JSON.

    Concatenates ``fname``, ``path``, ``ctype`` and the ``meta_data`` column
    (cast to text, with NULL replaced by an empty string), separated by
    single spaces.
    """
    return func.concat(
        DBMeta.fname,
        sa.literal(" "),
        DBMeta.path,
        sa.literal(" "),
        DBMeta.ctype,
        sa.literal(" "),
        func.coalesce(sa.cast(DBMeta.meta_data, sa.Text()), ""),
    )


def _metadata_tsvector(lang: str):
    """Return ``to_tsvector(lang, <metadata text>)`` as a SQL expression."""
    return func.to_tsvector(lang, _metadata_text())
def search_documents(
    session: Session,
    *,
    pool_id: int,
    query: str,
    lang: str = "german",
    limit: int = 50,
    mode: QueryMode = "websearch",
) -> list[DocSearchHit]:
    """Search file bodies and metadata with PostgreSQL full-text matching.

    A row matches when the query hits the extracted content (``ts_content``)
    or the metadata text (filename, path, content type and the Tika
    ``meta_data`` JSON).

    Args:
        session: SQLAlchemy session used to execute the statement.
        pool_id: Only rows whose ``pool_id`` equals this value are searched.
        query: User query; surrounding whitespace is stripped first.
        lang: Text-search configuration name handed to the SQL functions.
        limit: Maximum number of rows requested from the database.
        mode: Query parser selection forwarded to ``_tsquery``.

    Returns:
        Hits ordered by descending rank, then by path. An empty list is
        returned without querying when *query* is blank.
    """
    needle = query.strip()
    if not needle:
        return []

    ts_query = _tsquery(lang, needle, mode)
    meta_vector = _metadata_tsvector(lang)

    matches_content = DBMeta.ts_content.bool_op("@@")(ts_query)
    matches_meta = meta_vector.bool_op("@@")(ts_query)

    # Either rank may be NULL (coalesced to 0.0); the final score is the sum.
    content_rank = func.coalesce(func.ts_rank(DBMeta.ts_content, ts_query), 0.0)
    meta_rank = func.coalesce(func.ts_rank(meta_vector, ts_query), 0.0)
    score = content_rank + meta_rank

    # Prefer a headline from the document body; fall back to the metadata
    # text when the body headline is NULL or empty.
    meta_text = _metadata_text()
    body_headline = func.ts_headline(lang, DBMeta.inhalt, ts_query, type_=sa.Text())
    meta_headline = func.ts_headline(lang, meta_text, ts_query, type_=sa.Text())
    excerpt = func.coalesce(func.nullif(body_headline, ""), meta_headline)

    stmt = select(
        DBMeta.id.label("meta_id"),
        DBMeta.path,
        DBMeta.fname,
        DBMeta.suffix,
        score.label("rank"),
        excerpt.label("snippet"),
    )
    stmt = stmt.where(DBMeta.pool_id == pool_id)
    stmt = stmt.where(or_(matches_content, matches_meta))
    stmt = stmt.order_by(score.desc(), DBMeta.path)
    stmt = stmt.limit(limit)

    return [
        DocSearchHit(
            meta_id=record.meta_id,
            path=record.path,
            fname=record.fname,
            suffix=record.suffix,
            rank=float(record.rank),
            snippet=record.snippet or "",
        )
        for record in session.execute(stmt)
    ]