Skip to content
This repository was archived by the owner on Feb 19, 2021. It is now read-only.

Commit 65b88c4

Browse files
committed
Don't parse dates with more than 4 digits for the year
The regex was broken before: it used the negative lookahead `(?!=…)` where the positive lookbehind `(?<=…)` was intended.
1 parent 66777bc commit 65b88c4

File tree

2 files changed

+28
-5
lines changed

2 files changed

+28
-5
lines changed

src/documents/parsers.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@
2121
# - MONTH ZZZZ, with ZZZZ being 4 digits
2222
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
2323
DATE_REGEX = re.compile(
24-
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
25-
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
26-
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
27-
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
28-
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
24+
r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
25+
r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
26+
r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
27+
r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
28+
r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
2929
)
3030

3131

src/paperless_tesseract/tests/test_date.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,26 @@ def test_crazy_date_past(self, *args):
182182
document = RasterisedDocumentParser("/dev/null")
183183
document.get_text()
184184
self.assertIsNone(document.get_date())
185+
186+
EXTRA = {
187+
"123/04/2020/3423": None,
188+
"-23/04/2020-foo": "2020 04 23",
189+
"-23-04-2020-blurb": "2020 04 23",
190+
# The next case is disabled: it gets parsed as month 23, day 04, which is invalid.
191+
#"-2020-04-23-bar": "2020 04 23",
192+
"12020-04-23-": None,
193+
"-2020-04-234": None,
194+
}
195+
196+
@mock.patch(MOCK_SCRATCH, SCRATCH)
197+
def test_date_format_bulk(self):
198+
timezone = tz.gettz(settings.TIME_ZONE)
199+
for input, expected in self.EXTRA.items():
200+
if expected is not None:
201+
raw = [int(x) for x in expected.split()]
202+
expected = datetime.datetime(*raw, tzinfo=timezone)
203+
204+
input_file = os.path.join(self.SAMPLE_FILES, "")
205+
document = RasterisedDocumentParser(input_file)
206+
document._text = input
207+
self.assertEqual(document.get_date(), expected, msg="Test case {!r}".format(input))

0 commit comments

Comments
 (0)