From 0090931a015d17360a599ed17565d55fbb2e68ac Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Wed, 24 Jun 2026 14:49:31 +0800 Subject: [PATCH] Add saliency: spectral-residual visual saliency (where to look) When there's no template, colour or text to key on, an agent still needs a cue for where to look. Compute the spectral-residual saliency map (Hou & Zhang 2007) and rank salient boxes in source coordinates. Pure numpy FFT (cv2.saliency is opencv-contrib, forbidden), reusing visual_match's grayscale loader and cv2_utils.blobs.connected_boxes; regions threshold at mean+2*std by default. A coarse attention cue to narrow where a template / OCR pass then looks. --- WHATS_NEW.md | 6 ++ .../doc/new_features/v190_features_doc.rst | 49 +++++++++ .../Zh/doc/new_features/v190_features_doc.rst | 42 ++++++++ je_auto_control/__init__.py | 5 + .../gui/script_builder/command_schema.py | 18 ++++ .../utils/executor/action_executor.py | 23 ++++ .../utils/mcp_server/tools/_factories.py | 29 +++++ .../utils/mcp_server/tools/_handlers.py | 11 ++ je_auto_control/utils/saliency/__init__.py | 6 ++ je_auto_control/utils/saliency/saliency.py | 101 ++++++++++++++++++ .../unit_test/headless/test_saliency_batch.py | 79 ++++++++++++++ 11 files changed, 369 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v190_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v190_features_doc.rst create mode 100644 je_auto_control/utils/saliency/__init__.py create mode 100644 je_auto_control/utils/saliency/saliency.py create mode 100644 test/unit_test/headless/test_saliency_batch.py diff --git a/WHATS_NEW.md b/WHATS_NEW.md index 1e16dbb5..4c39e190 100644 --- a/WHATS_NEW.md +++ b/WHATS_NEW.md @@ -1,5 +1,11 @@ # What's New — AutoControl +## What's new (2026-06-24) — Visual Saliency (where to look — spectral-residual) + +Find the region that stands out, with no template / colour / text. 
Full reference: [`docs/source/Eng/doc/new_features/v190_features_doc.rst`](docs/source/Eng/doc/new_features/v190_features_doc.rst). + +- **`saliency_map` / `salient_regions` / `most_salient`** (`AC_salient_regions`, `AC_most_salient`): when there's no template, colour or text to key on, an agent still needs a cue for *where to look*. This computes the spectral-residual saliency map (Hou & Zhang 2007 — log amplitude minus its local average, reconstructed through the phase) and turns it into ranked salient boxes in source pixel coordinates. The transform is a pure numpy FFT (`cv2.saliency` is in the forbidden opencv-contrib package, so it's re-implemented over base opencv); it reuses `visual_match`'s grayscale loader and `cv2_utils.blobs.connected_boxes`. Regions threshold at `mean + 2·std` by default. A coarse attention cue to *narrow* where a template / OCR pass then looks. No `PySide6`. + ## What's new (2026-06-24) — Display-Scale / Visual-DPI Detection Infer which display scale (DPI) a template renders at — and how confidently. Full reference: [`docs/source/Eng/doc/new_features/v189_features_doc.rst`](docs/source/Eng/doc/new_features/v189_features_doc.rst). diff --git a/docs/source/Eng/doc/new_features/v190_features_doc.rst b/docs/source/Eng/doc/new_features/v190_features_doc.rst new file mode 100644 index 00000000..4ebf33a0 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v190_features_doc.rst @@ -0,0 +1,49 @@ +Visual Saliency (where to look — spectral-residual) +=================================================== + +When there is no template, no known colour and no text to OCR, an agent still +needs a cue for *where to look* — the region that stands out from its +surroundings (a popup, a badge, a highlighted row). ``saliency`` computes the +spectral-residual saliency map (Hou & Zhang 2007) — ``log`` amplitude minus its +local average, reconstructed through the phase — and turns it into ranked salient +boxes. 
+ +* :func:`saliency_map` — the normalised (0–1) saliency map as an ndarray, +* :func:`salient_regions` — ranked salient boxes ``{x, y, width, height, center, + score}`` in source pixel coordinates, +* :func:`most_salient` — the single most salient region (the first place to look). + +The transform is a pure ``numpy`` FFT — ``cv2.saliency`` lives in the forbidden +opencv-contrib package, so it is re-implemented over base opencv only. It reuses +``visual_match``'s grayscale loader (any ndarray / path / PIL image, or the live +screen) and ``cv2_utils.blobs.connected_boxes`` for region extraction. cv2 / +numpy are lazily imported. Imports no ``PySide6``. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import saliency_map, salient_regions, most_salient + + most_salient("screen.png") + # {"x": 612, "y": 40, "width": 180, "height": 36, "center": [702, 58], + # "score": 0.82} + + for region in salient_regions("screen.png"): # most-salient first + ... + + sal = saliency_map("screen.png") # (64, 64) float32 in 0..1 + +Regions are thresholded at ``mean + 2·std`` of the saliency map by default (pass +``threshold`` to override), extracted with ``connected_boxes`` and scaled back to +the source's pixel coordinates. ``size`` is the (small) resolution the saliency is +computed at. Saliency is a coarse attention cue, not a precise detector — use it +to *narrow* where a template / OCR pass then looks. + +Executor commands +----------------- + +``AC_salient_regions`` and ``AC_most_salient`` (``source`` / ``region`` / ``size`` +/ ``threshold`` / ``min_area``). They are exposed as read-only ``ac_*`` MCP tools +and as Script Builder commands under **Image**. 
diff --git a/docs/source/Zh/doc/new_features/v190_features_doc.rst b/docs/source/Zh/doc/new_features/v190_features_doc.rst new file mode 100644 index 00000000..6167c75c --- /dev/null +++ b/docs/source/Zh/doc/new_features/v190_features_doc.rst @@ -0,0 +1,42 @@ +視覺顯著度(該看哪裡——spectral-residual) +========================================== + +當沒有模板、沒有已知顏色、也沒有文字可 OCR 時,agent 仍需要一個*該看哪裡*的線索——也就是從 +周遭凸顯出來的區域(彈出視窗、徽章、被反白的列)。``saliency`` 計算 spectral-residual 顯著度圖 +(Hou & Zhang 2007)——``log`` 振幅減去其區域平均,再透過相位重建——並轉成排序後的顯著方框。 + +* :func:`saliency_map` ——正規化(0–1)的顯著度圖(ndarray), +* :func:`salient_regions` ——排序後的顯著方框 ``{x, y, width, height, center, score}`` + (以來源像素座標表示), +* :func:`most_salient` ——單一最顯著的區域(第一個該看的地方)。 + +此轉換為純 ``numpy`` FFT——``cv2.saliency`` 位於被禁用的 opencv-contrib 套件,故在 base opencv +上重新實作。它重用 ``visual_match`` 的灰階載入器(任何 ndarray / 路徑 / PIL 影像,或即時螢幕畫面)與 +``cv2_utils.blobs.connected_boxes`` 做區域擷取。cv2 / numpy 為延遲匯入。不匯入 ``PySide6``。 + +無頭 API +-------- + +.. code-block:: python + + from je_auto_control import saliency_map, salient_regions, most_salient + + most_salient("screen.png") + # {"x": 612, "y": 40, "width": 180, "height": 36, "center": [702, 58], + # "score": 0.82} + + for region in salient_regions("screen.png"): # 最顯著者在前 + ... 
+ + sal = saliency_map("screen.png") # (64, 64) float32,範圍 0..1 + +區域預設以顯著度圖的 ``mean + 2·std`` 為門檻(可傳 ``threshold`` 覆寫),以 ``connected_boxes`` +擷取,並縮放回來源的像素座標。``size`` 是計算顯著度所用的(較小)解析度。顯著度是粗略的注意力 +線索,而非精確偵測器——用它來*縮小*接著由模板 / OCR 比對的範圍。 + +執行器指令 +---------- + +``AC_salient_regions`` 與 ``AC_most_salient``(``source`` / ``region`` / ``size`` / +``threshold`` / ``min_area``)。皆以唯讀 ``ac_*`` MCP 工具及 Script Builder 指令(位於 **Image** +分類下)形式提供。 diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 405a84b5..0cd404e9 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -84,6 +84,10 @@ ) # Display-scale / visual-DPI detection (per-scale match profile) from je_auto_control.utils.scale_detect import detect_scale, scale_sweep +# Spectral-residual visual saliency (where to look — map + salient regions) +from je_auto_control.utils.saliency import ( + most_salient, salient_regions, saliency_map, +) # VLM element locator (headless) from je_auto_control.utils.vision import ( VLMNotAvailableError, click_by_description, locate_by_description, @@ -1660,6 +1664,7 @@ def start_autocontrol_gui(*args, **kwargs): "plan_file_drop", "drop_files", "image_quality", "is_blurry", "quality_gate", "detect_scale", "scale_sweep", + "saliency_map", "salient_regions", "most_salient", # VLM locator "VLMNotAvailableError", "locate_by_description", "click_by_description", "verify_description", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index 43d2eeb7..8019fb23 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -787,6 +787,24 @@ def _add_image_specs(specs: List[CommandSpec]) -> None: ), description="Per-scale match-score profile of a template.", )) + saliency_fields = ( + FieldSpec("source", FieldType.FILE_PATH, optional=True), + FieldSpec("region", FieldType.STRING, optional=True, + 
placeholder=_REGION_PLACEHOLDER), + FieldSpec("size", FieldType.INT, optional=True, default=64), + FieldSpec("threshold", FieldType.FLOAT, optional=True), + FieldSpec("min_area", FieldType.INT, optional=True, default=4), + ) + specs.append(CommandSpec( + "AC_salient_regions", "Image", "Salient Regions", + fields=saliency_fields, + description="Visually salient regions (spectral-residual; where to look).", + )) + specs.append(CommandSpec( + "AC_most_salient", "Image", "Most Salient Region", + fields=saliency_fields, + description="The single most visually salient region of an image/screen.", + )) specs.append(CommandSpec( "AC_changed_regions", "Image", "Changed Regions (motion)", fields=( diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index 0f3eaf37..39a99780 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -4327,6 +4327,27 @@ def _scale_sweep(template: Any, haystack: Any = None, region: Any = None, method=str(method))} +def _salient_regions(source: Any = None, region: Any = None, size: Any = 64, + threshold: Any = None, min_area: Any = 4) -> Dict[str, Any]: + """Adapter: ranked visually-salient regions of an image / the screen.""" + from je_auto_control.utils.saliency import salient_regions + cut = float(threshold) if threshold not in (None, "") else None + regions = salient_regions(source, region=_coerce_region(region), + size=int(size), threshold=cut, + min_area=int(min_area)) + return {"regions": regions, "count": len(regions)} + + +def _most_salient(source: Any = None, region: Any = None, size: Any = 64, + threshold: Any = None, min_area: Any = 4) -> Dict[str, Any]: + """Adapter: the single most visually-salient region (where to look).""" + from je_auto_control.utils.saliency import most_salient + cut = float(threshold) if threshold not in (None, "") else None + result = most_salient(source, region=_coerce_region(region), 
+ size=int(size), threshold=cut, min_area=int(min_area)) + return {"found": result is not None, "region": result} + + def _image_histogram(source: Any = None, bins: Any = 32, space: str = "hsv", region: Any = None) -> Dict[str, Any]: """Adapter: per-channel colour histogram of an image / the screen.""" @@ -6553,6 +6574,8 @@ def __init__(self): "AC_quality_gate": _quality_gate, "AC_detect_scale": _detect_scale, "AC_scale_sweep": _scale_sweep, + "AC_salient_regions": _salient_regions, + "AC_most_salient": _most_salient, "AC_image_histogram": _image_histogram, "AC_histogram_changed": _histogram_changed, "AC_changed_regions": _changed_regions, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index efd97227..b84715a1 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -3443,6 +3443,35 @@ def img_histogram_tools() -> List[MCPTool]: handler=h.scale_sweep, annotations=READ_ONLY, ), + MCPTool( + name="ac_salient_regions", + description=("Visually salient regions of 'source' (image path; " + "default screen grab of 'region') via spectral-residual " + "saliency — where to look with no template/text. Returns " + "{regions:[{x,y,width,height,center,score}], count}."), + input_schema=schema({ + "source": {"type": "string"}, + "region": {"type": "array", "items": {"type": "integer"}}, + "size": {"type": "integer"}, + "threshold": {"type": "number"}, + "min_area": {"type": "integer"}}), + handler=h.salient_regions, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_most_salient", + description=("The single most visually salient region of 'source' " + "(default screen): {found, region:{x,y,width,height," + "center,score}}. 
The first place to look."), + input_schema=schema({ + "source": {"type": "string"}, + "region": {"type": "array", "items": {"type": "integer"}}, + "size": {"type": "integer"}, + "threshold": {"type": "number"}, + "min_area": {"type": "integer"}}), + handler=h.most_salient, + annotations=READ_ONLY, + ), ] diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index 3c2016e7..c00840fe 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -2532,6 +2532,17 @@ def scale_sweep(template, haystack=None, region=None, scales=None, return _scale_sweep(template, haystack, region, scales, method) +def salient_regions(source=None, region=None, size=64, threshold=None, + min_area=4): + from je_auto_control.utils.executor.action_executor import _salient_regions + return _salient_regions(source, region, size, threshold, min_area) + + +def most_salient(source=None, region=None, size=64, threshold=None, min_area=4): + from je_auto_control.utils.executor.action_executor import _most_salient + return _most_salient(source, region, size, threshold, min_area) + + def image_histogram(source=None, bins=32, space="hsv", region=None): from je_auto_control.utils.executor.action_executor import _image_histogram return _image_histogram(source, bins, space, region) diff --git a/je_auto_control/utils/saliency/__init__.py b/je_auto_control/utils/saliency/__init__.py new file mode 100644 index 00000000..396700d2 --- /dev/null +++ b/je_auto_control/utils/saliency/__init__.py @@ -0,0 +1,6 @@ +"""Spectral-residual visual saliency: map + ranked salient regions (numpy FFT).""" +from je_auto_control.utils.saliency.saliency import ( + most_salient, salient_regions, saliency_map, +) + +__all__ = ["saliency_map", "salient_regions", "most_salient"] diff --git a/je_auto_control/utils/saliency/saliency.py b/je_auto_control/utils/saliency/saliency.py new file mode 100644 index 
00000000..a4dd92cd --- /dev/null +++ b/je_auto_control/utils/saliency/saliency.py @@ -0,0 +1,101 @@ +"""Find the visually salient regions of a frame (spectral-residual saliency). + +When there is no template, no known colour and no text to OCR, an agent still +needs a cue for *where to look* — the region that stands out from its +surroundings (a popup, a badge, a highlighted row). ``saliency`` computes the +spectral-residual saliency map (Hou & Zhang 2007) — ``log`` amplitude minus its +local average, reconstructed through the phase — and turns it into ranked salient +boxes. + +The transform is a pure ``numpy`` FFT (``cv2.saliency`` lives in the forbidden +opencv-contrib package, so it is re-implemented here over base opencv only). It +reuses ``visual_match``'s grayscale loader for the source (any ndarray / path / +PIL image, or the live screen) and ``cv2_utils.blobs.connected_boxes`` for the +region extraction. cv2 / numpy are lazily imported. Imports no ``PySide6``. +""" +from typing import Any, Dict, List, Optional, Sequence, Tuple + +ImageSource = Any + + +def _gray(source: Optional[ImageSource], region: Optional[Sequence[int]]): + from je_auto_control.utils.visual_match.visual_match import _haystack_gray + return _haystack_gray(source, region) + + +def _saliency_from_gray(gray, size: int): + import cv2 + import numpy as np + small = cv2.resize(gray, (size, size), + interpolation=cv2.INTER_AREA).astype(np.float32) + fft = np.fft.fft2(small) + log_amplitude = np.log(np.abs(fft) + 1e-8) + residual = log_amplitude - cv2.blur(log_amplitude, (3, 3)) + recon = np.fft.ifft2(np.exp(residual + 1j * np.angle(fft))) + smoothed = cv2.GaussianBlur(np.abs(recon) ** 2, (0, 0), sigmaX=3.0) + peak = float(smoothed.max()) + if peak > 0: + smoothed = smoothed / peak + return smoothed.astype(np.float32) + + +def saliency_map(source: Optional[ImageSource] = None, *, + region: Optional[Sequence[int]] = None, size: int = 64): + """Return the normalised (0–1) spectral-residual 
saliency map as an ndarray. + + The map is computed at ``size`` x ``size`` (the algorithm's native low + resolution); higher = more salient. + """ + return _saliency_from_gray(_gray(source, region), int(size)) + + +def _regions_from_saliency(saliency, orig_shape: Tuple[int, int], + threshold: Optional[float], min_area: int, + size: int) -> List[Dict[str, Any]]: + from je_auto_control.utils.cv2_utils.blobs import connected_boxes + if threshold is not None: + cut = float(threshold) + else: # scale-invariant: regions standing 2 std above the mean saliency + cut = float(saliency.mean()) + 2.0 * float(saliency.std()) + mask = (saliency >= cut).astype("uint8") * 255 + orig_height, orig_width = int(orig_shape[0]), int(orig_shape[1]) + scale_x, scale_y = orig_width / float(size), orig_height / float(size) + regions: List[Dict[str, Any]] = [] + for box in connected_boxes(mask, min_area=min_area): + x, y = int(box["x"] * scale_x), int(box["y"] * scale_y) + width = max(1, int(box["width"] * scale_x)) + height = max(1, int(box["height"] * scale_y)) + patch = saliency[box["y"]:box["y"] + box["height"], + box["x"]:box["x"] + box["width"]] + score = float(patch.mean()) if patch.size else 0.0 + regions.append({"x": x, "y": y, "width": width, "height": height, + "center": [x + width // 2, y + height // 2], + "score": score}) + regions.sort(key=lambda region: region["score"], reverse=True) + return regions + + +def salient_regions(source: Optional[ImageSource] = None, *, + region: Optional[Sequence[int]] = None, size: int = 64, + threshold: Optional[float] = None, + min_area: int = 4) -> List[Dict[str, Any]]: + """Return salient regions as ``[{x, y, width, height, center, score}]``. + + Boxes are thresholded from the saliency map (default cut = mean plus two + standard deviations), extracted with ``connected_boxes`` and scaled back to + the source's pixel coordinates, ranked most-salient first. 
+ """ + gray = _gray(source, region) + saliency = _saliency_from_gray(gray, int(size)) + return _regions_from_saliency(saliency, gray.shape[:2], threshold, + int(min_area), int(size)) + + +def most_salient(source: Optional[ImageSource] = None, *, + region: Optional[Sequence[int]] = None, size: int = 64, + threshold: Optional[float] = None, + min_area: int = 4) -> Optional[Dict[str, Any]]: + """Return the single most salient region, or ``None`` if none stand out.""" + regions = salient_regions(source, region=region, size=size, + threshold=threshold, min_area=min_area) + return regions[0] if regions else None diff --git a/test/unit_test/headless/test_saliency_batch.py b/test/unit_test/headless/test_saliency_batch.py new file mode 100644 index 00000000..ccd9a964 --- /dev/null +++ b/test/unit_test/headless/test_saliency_batch.py @@ -0,0 +1,79 @@ +"""Headless tests for spectral-residual saliency (cv2/numpy synthetic frames).""" +import pytest + +import je_auto_control as ac + +np = pytest.importorskip("numpy") +pytest.importorskip("cv2") + +from je_auto_control.utils.saliency import ( # noqa: E402 + most_salient, salient_regions, saliency_map, +) + + +def _structured(): + """A dark frame with three bright blocks.""" + img = np.full((240, 320, 3), 20, "uint8") + img[40:80, 40:80] = 240 + img[150:190, 200:240] = 230 + img[100:130, 140:175] = 255 + return img + + +def test_saliency_map_shape_and_range(): + sal_map = saliency_map(_structured()) + assert sal_map.shape == (64, 64) + assert sal_map.dtype == np.float32 + assert float(sal_map.min()) >= 0.0 + assert float(sal_map.max()) <= 1.0 + + +def test_size_parameter_changes_map_resolution(): + assert saliency_map(_structured(), size=32).shape == (32, 32) + + +def test_salient_regions_in_bounds_and_ranked(): + regions = salient_regions(_structured()) + assert len(regions) >= 1 + for region in regions: + assert 0 <= region["x"] and region["x"] + region["width"] <= 320 + assert 0 <= region["y"] and region["y"] + 
region["height"] <= 240 + assert 0.0 <= region["score"] <= 1.0 + scores = [r["score"] for r in regions] + assert scores == sorted(scores, reverse=True) + + +def test_most_salient_matches_top_region(): + img = _structured() + top = most_salient(img) + assert top is not None and top == salient_regions(img)[0] + + +def test_high_threshold_yields_nothing(): + img = _structured() + assert salient_regions(img, threshold=2.0) == [] # above the normalised max + assert most_salient(img, threshold=2.0) is None + + +# --- wiring --------------------------------------------------------------- + +def test_executor_pure_path(): + from je_auto_control.utils.executor.action_executor import _salient_regions + out = _salient_regions(_structured()) + assert isinstance(out["regions"], list) and len(out["regions"]) >= 1 + + +def test_wiring(): + known = set(ac.executor.known_commands()) + assert {"AC_salient_regions", "AC_most_salient"} <= known + from je_auto_control.utils.mcp_server.tools import build_default_tool_registry + names = {t.name for t in build_default_tool_registry()} + assert {"ac_salient_regions", "ac_most_salient"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert {"AC_salient_regions", "AC_most_salient"} <= specs + + +def test_facade_exports(): + for name in ("saliency_map", "salient_regions", "most_salient"): + assert hasattr(ac, name) and name in ac.__all__