Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
159315e
initial code for pydantic based validation for yaml files
omri374 Oct 19, 2025
cfc7c1b
Validation layer for YAML based configuration - cont'd
omri374 Nov 11, 2025
f7b54d4
Merge branch 'main' into omri/pydantic_validation
omri374 Nov 11, 2025
0fbd010
linting
omri374 Nov 11, 2025
c4841b5
Update presidio-analyzer/presidio_analyzer/input_validation/yaml_reco…
omri374 Nov 11, 2025
3939fc9
Update presidio-analyzer/presidio_analyzer/input_validation/yaml_reco…
omri374 Nov 11, 2025
d7cb69b
Update presidio-analyzer/tests/test_configuration_validator.py
omri374 Nov 11, 2025
3b61469
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
251cefc
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
1420bd5
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
c677b79
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
421e47d
Update presidio-analyzer/tests/test_recognizer_registry_provider.py
omri374 Nov 11, 2025
2901b13
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
8b84370
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
1cf7678
Merge branch 'main' into omri/pydantic_validation
omri374 Nov 17, 2025
108e3d0
ruff on the entire analyzer codebase
omri374 Nov 19, 2025
b492b79
Merge branch 'main' into omri/pydantic_validation
omri374 Nov 19, 2025
eb9f7c7
Merge remote-tracking branch 'origin/omri/pydantic_validation' into o…
omri374 Nov 19, 2025
bfd067b
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 19, 2025
f2e7fd9
ruff and copilot review fixes
omri374 Nov 19, 2025
18df02e
merge
omri374 Nov 19, 2025
41328cc
Delete presidio-analyzer/test-output.xml
omri374 Nov 19, 2025
bd2d045
fixed bad test
omri374 Nov 19, 2025
11a8169
ruff
omri374 Nov 19, 2025
8054750
removed wrong test which assumes defaults
omri374 Nov 20, 2025
86baa12
Clean up comments in recognizers_loader_utils.py
omri374 Nov 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import yaml

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.input_validation import ConfigurationValidator
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider

Expand All @@ -29,14 +30,21 @@ def __init__(
nlp_engine_conf_file: Optional[Union[Path, str]] = None,
recognizer_registry_conf_file: Optional[Union[Path, str]] = None,
):
if analyzer_engine_conf_file:
ConfigurationValidator.validate_file_path(analyzer_engine_conf_file)
if nlp_engine_conf_file:
ConfigurationValidator.validate_file_path(nlp_engine_conf_file)
if recognizer_registry_conf_file:
ConfigurationValidator.validate_file_path(recognizer_registry_conf_file)

self.configuration = self.get_configuration(conf_file=analyzer_engine_conf_file)
self.nlp_engine_conf_file = nlp_engine_conf_file
self.recognizer_registry_conf_file = recognizer_registry_conf_file

def get_configuration(
self, conf_file: Optional[Union[Path, str]]
) -> Union[Dict[str, Any]]:
"""Retrieve the analyzer engine configuration from the provided file."""
"""Retrieve analyzer engine configuration from the provided file."""

if not conf_file:
default_conf_file = self._get_full_conf_path()
Expand All @@ -59,10 +67,18 @@ def get_configuration(
with open(self._get_full_conf_path()) as file:
configuration = yaml.safe_load(file)
except Exception:
print(f"Failed to parse file {conf_file}, resorting to default")
logger.warning(
f"Failed to parse file {conf_file}, resorting to default"
)
with open(self._get_full_conf_path()) as file:
configuration = yaml.safe_load(file)

# Validate configuration using Pydantic-based ConfigurationValidator
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this comment necessary?

from presidio_analyzer.input_validation import ConfigurationValidator
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this import be at the top of the file?


ConfigurationValidator.validate_analyzer_configuration(configuration)
logger.debug("Analyzer configuration validation passed")

return configuration

def create_engine(self) -> AnalyzerEngine:
Expand Down
5 changes: 3 additions & 2 deletions presidio-analyzer/presidio_analyzer/analyzer_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,6 @@ def __init__(self, req_data: Dict):
self.context = req_data.get("context")
self.allow_list = req_data.get("allow_list")
self.allow_list_match = req_data.get("allow_list_match", "exact")
self.regex_flags = req_data.get("regex_flags",
re.DOTALL | re.MULTILINE | re.IGNORECASE)
self.regex_flags = req_data.get(
"regex_flags", re.DOTALL | re.MULTILINE | re.IGNORECASE
)
154 changes: 154 additions & 0 deletions presidio-analyzer/presidio_analyzer/conf/default_analyzer_full.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
supported_languages:
- en
default_score_threshold: 0
nlp_configuration:
nlp_engine_name: spacy
models:
- lang_code: en
model_name: en_core_web_lg

ner_model_configuration:
model_to_presidio_entity_mapping:
PER: PERSON
PERSON: PERSON
NORP: NRP
FAC: LOCATION
LOC: LOCATION
LOCATION: LOCATION
GPE: LOCATION
ORG: ORGANIZATION
ORGANIZATION: ORGANIZATION
DATE: DATE_TIME
TIME: DATE_TIME

low_confidence_score_multiplier: 0.4
low_score_entity_names:
-
labels_to_ignore:
- ORG
- ORGANIZATION # has many false positives
- CARDINAL
- EVENT
- LANGUAGE
- LAW
- MONEY
- ORDINAL
- PERCENT
- PRODUCT
- QUANTITY
- WORK_OF_ART


recognizer_registry:
# global_regex_flags: 26
recognizers:
# Recognizers listed here can either be loaded from the recognizers defined in code (type: predefined),
# or created based on the provided configuration (type: custom).
# For predefined:
# - If only a recognizer name is provided, a predefined recognizer with this name and default parameters will be loaded.
# - If a parameter isn't provided, its default value will be used.
# For custom:
# - See an example configuration here: https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/conf/example_recognizers.yaml
# - Custom pattern recognizers with this configuration can be added to this file, with type: custom
# For recognizers supporting more than one language, an instance of the recognizer for each language will be created.
# For example, see the CreditCardRecognizer definition below:
- name: CreditCardRecognizer
supported_languages:
- language: en
context: [credit, card, visa, mastercard, cc, amex, discover, jcb, diners, maestro, instapayment]
type: predefined

- name: UsBankRecognizer
type: predefined

- name: UsLicenseRecognizer
type: predefined

- name: UsItinRecognizer
type: predefined

- name: UsPassportRecognizer
type: predefined

- name: UsSsnRecognizer
type: predefined

- name: NhsRecognizer
type: predefined

- name: UkNinoRecognizer
type: predefined
enabled: false

- name: SgFinRecognizer
type: predefined
enabled: false

- name: AuAbnRecognizer
type: predefined
enabled: false

- name: AuAcnRecognizer
type: predefined
enabled: false

- name: AuTfnRecognizer
type: predefined
enabled: false

- name: AuMedicareRecognizer
type: predefined
enabled: false

- name: InPanRecognizer
type: predefined
enabled: false

- name: InAadhaarRecognizer
supported_languages:
- en
type: predefined
enabled: false

- name: InVehicleRegistrationRecognizer
type: predefined
enabled: false

- name: InPassportRecognizer
type: predefined
enabled: false

- name: CryptoRecognizer
type: predefined

- name: DateRecognizer
type: predefined

- name: EmailRecognizer
type: predefined

- name: IbanRecognizer
type: predefined

- name: IpRecognizer
type: predefined

- name: MedicalLicenseRecognizer
type: predefined

- name: PhoneRecognizer
type: predefined

- name: UrlRecognizer
type: predefined

- name: InVoterRecognizer
type: predefined
enabled: false

- name: InGstinRecognizer
type: predefined
enabled: false

- name: SpacyRecognizer
type: predefined
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Context awareness modules."""

from .context_aware_enhancer import ContextAwareEnhancer
from .lemma_context_aware_enhancer import LemmaContextAwareEnhancer

Expand Down
19 changes: 19 additions & 0 deletions presidio-analyzer/presidio_analyzer/input_validation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Configuration validation module for Presidio."""

from .schemas import ConfigurationValidator
from .yaml_recognizer_models import (
BaseRecognizerConfig,
CustomRecognizerConfig,
LanguageContextConfig,
PredefinedRecognizerConfig,
RecognizerRegistryConfig,
)

# Public API of the input_validation package: the validator imported from
# .schemas plus the recognizer configuration models imported from
# .yaml_recognizer_models above.
__all__ = [
"ConfigurationValidator",
"BaseRecognizerConfig",
"CustomRecognizerConfig",
"LanguageContextConfig",
"PredefinedRecognizerConfig",
"RecognizerRegistryConfig",
]
Loading