This repository was archived by the owner on Feb 19, 2021. It is now read-only.
File tree Expand file tree Collapse file tree 2 files changed +28
-5
lines changed
paperless_tesseract/tests Expand file tree Collapse file tree 2 files changed +28
-5
lines changed Original file line number Diff line number Diff line change 2121# - MONTH ZZZZ, with ZZZZ being 4 digits
2222# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
2323DATE_REGEX = re .compile (
24- r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
25- r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
26- r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
27- r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
28- r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
24+ r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
25+ r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
26+ r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
27+ r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
28+ r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
2929)
3030
3131
Original file line number Diff line number Diff line change @@ -182,3 +182,26 @@ def test_crazy_date_past(self, *args):
182182 document = RasterisedDocumentParser ("/dev/null" )
183183 document .get_text ()
184184 self .assertIsNone (document .get_date ())
185+
186+ EXTRA = {
187+ "123/04/2020/3423" : None ,
188+ "-23/04/2020-foo" : "2020 04 23" ,
189+ "-23-04-2020-blurb" : "2020 04 23" ,
190+ # gets parsed as month: 23, day: 04, which is invalid
191+ #"-2020-04-23-bar": "2020 04 23",
192+ "12020-04-23-" : None ,
193+ "-2020-04-234" : None ,
194+ }
195+
196+ @mock .patch (MOCK_SCRATCH , SCRATCH )
197+ def test_date_format_bulk (self ):
198+ timezone = tz .gettz (settings .TIME_ZONE )
199+ for input , expected in self .EXTRA .items ():
200+ if expected is not None :
201+ raw = [int (x ) for x in expected .split ()]
202+ expected = datetime .datetime (* raw , tzinfo = timezone )
203+
204+ input_file = os .path .join (self .SAMPLE_FILES , "" )
205+ document = RasterisedDocumentParser (input_file )
206+ document ._text = input
207+ self .assertEqual (document .get_date (), expected , msg = "Test case {!r}" .format (input ))
You can’t perform that action at this time.
0 commit comments