updated repo

sivakumar-mahalingam · Apr 8, 2024 · a9dd2cb · a9dd2cb
1 parent 9df81d8
commit a9dd2cb
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -12,14 +12,27 @@ This repository extracts the Machine Readable Zone (MRZ) from document images. T
 
 **️Features:**
 
-- Automatically detects and extracts the MRZ region from passport images.
-- Utilizes contour detection to accurately identify the MRZ area.
-- Outputs the extracted MRZ region as text for further processing or analysis.
+- Detects and extracts the MRZ region from document images
+- Uses contour detection to accurately identify the MRZ area
+- Custom-trained models for TensorFlow and Tesseract
+- Includes checksum logic for data validation
+- Outputs the extracted MRZ region as text/JSON for further processing or analysis
+
+
+## Built With
+
+![NumPy](https://img.shields.io/badge/numpy-%23013243.svg?style=for-the-badge&logo=numpy&logoColor=white)
+
+![Tensorflow](https://img.shields.io/badge/TensorFlow-FF6F00?style=for-the-badge&logo=tensorflow&logoColor=white)
+
+![OpenCV](https://img.shields.io/badge/OpenCV-27338e?style=for-the-badge&logo=OpenCV&logoColor=white)
+
+![Tesseract OCR](https://img.shields.io/badge/Tesseract%20OCR-0F9D58?style=for-the-badge&logo=google&logoColor=white)
 
 ## Installation
 
 1. Install `fastmrz` from pip
-    ```console
+    ```Console
     $ pip install fastmrz
     
     ---> 100%
@@ -41,24 +54,37 @@ fast_mrz = FastMRZ()
 # fast_mrz = FastMRZ(tesseract_path=r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract') # Default path in Mac
 # fast_mrz = FastMRZ(tesseract_path=r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe') # Default path in Windows
 passport_mrz = fast_mrz.get_mrz(os.path.abspath('../data/passport_uk.jpg'))
+print("JSON:")
 print(json.dumps(passport_mrz, indent=4))
+
+print("\n")
+
+passport_mrz = fast_mrz.get_raw_mrz(os.path.abspath('../data/passport_uk.jpg'))
+print("TEXT:")
+print(passport_mrz)
 ```
 
 **OUTPUT:**
-```Python
+```Console
+JSON:
 {
-    "mrz_type": "TD3", 
-    "document_type": "P", 
-    "country_code": "GBR", 
-    "surname": "PUDARSAN", 
-    "given_name": "HENERT", 
-    "document_number": "707797979", 
-    "nationality": "GBR", 
-    "date_of_birth": "1995-05-20", 
-    "sex": "M", 
-    "date_of_expiry": "2017-04-22", 
+    "mrz_type": "TD3",
+    "document_type": "P",
+    "country_code": "GBR",
+    "surname": "PUDARSAN",
+    "given_name": "HENERT",
+    "document_number": "707797979",
+    "nationality": "GBR",
+    "date_of_birth": "1995-05-20",
+    "sex": "M",
+    "date_of_expiry": "2017-04-22",
     "status": "SUCCESS"
 }
+
+
+TEXT:
+P<GBRPUDARSAN<<HENERT<<<<<<<<<<<<<<<<<<<<<<<
+7077979792GBR9505209M1704224<<<<<<<<<<<<<<00
 ```
 
 ## MRZ Wiki
@@ -82,6 +108,10 @@ Now, based on the example of a national passport, let us take a closer look at t
 
 ![MRZ GIF](https://raw.githubusercontent.com/sivakumar-mahalingam/fastmrz/main/docs/mrz.gif)
 
+## ToDo
+
+- [ ] Test with MRV-A and MRV-B (machine-readable visa) documents
+
 ## License
 
 Distributed under the AGPL-3.0 License. See `LICENSE` for more information.

diff --git a/fastmrz/fastmrz.py b/fastmrz/fastmrz.py
@@ -118,11 +118,10 @@ def _parse_mrz(self, mrz_text):
 
         mrz_code_dict = {}
         if len(mrz_lines) == 2:
-            # add optional data field
             mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_lines[0]) == 36 else 'TD3'
 
             # Line 1
-            mrz_code_dict['document_type'] = mrz_lines[0][:1]
+            mrz_code_dict['document_type'] = mrz_lines[0][:2].strip('<')
             mrz_code_dict['country_code'] = mrz_lines[0][2:5]
             names = mrz_lines[0][5:].split('<<')
             mrz_code_dict['surname'] = names[0].replace('<', ' ')
@@ -142,6 +141,10 @@ def _parse_mrz(self, mrz_text):
             if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][27]:
                 return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'}
             mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry'])
+            if mrz_code_dict['mrz_type'] == 'TD3':
+                mrz_code_dict['optional_data'] = mrz_lines[1][28:35].strip('<')
+
+            mrz_code_dict['optional_data'] = mrz_lines[1][28:35].strip('<') if mrz_code_dict['mrz_type'] == 'TD2' else mrz_lines[1][28:42].strip('<')
             if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines[1], mrz_code_dict['mrz_type']):
                 return {'status': 'FAILURE', 'message': 'final checksum is not matching'}
 
@@ -151,7 +154,7 @@ def _parse_mrz(self, mrz_text):
             mrz_code_dict['mrz_type'] = 'TD1'
 
             # Line 1
-            mrz_code_dict['document_type'] = mrz_lines[0][:2].replace('<', ' ')
+            mrz_code_dict['document_type'] = mrz_lines[0][:2].strip('<')
             mrz_code_dict['country_code'] = mrz_lines[0][2:5]
             mrz_code_dict['document_number'] = mrz_lines[0][5:14]
             if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[0][14]:

diff --git a/fastmrz/main.py b/fastmrz/main.py
@@ -7,5 +7,12 @@
 # fast_mrz = FastMRZ(tesseract_path=r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract') # Default path in Mac
 # fast_mrz = FastMRZ(tesseract_path=r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe') # Default path in Windows
 passport_mrz = fast_mrz.get_mrz(os.path.abspath('../data/passport_uk.jpg'))
+print("JSON:")
 print(json.dumps(passport_mrz, indent=4))
 
+print("\n")
+
+passport_mrz = fast_mrz.get_raw_mrz(os.path.abspath('../data/passport_uk.jpg'))
+print("TEXT:")
+print(passport_mrz)
+
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
     name='fastmrz',
-    version='0.0.1',
+    version='1.0',
     author='Sivakumar Mahalingam',
     description='Extracts the Machine Readable Zone (MRZ) data from document images',
     long_description=long_description,