Skip to content

Commit

Permalink
updated repo
Browse files Browse the repository at this point in the history
  • Loading branch information
sivakumar-mahalingam committed Apr 8, 2024
1 parent 9df81d8 commit a9dd2cb
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 19 deletions.
60 changes: 45 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,27 @@ This repository extracts the Machine Readable Zone (MRZ) from document images. T

**Features:**

- Automatically detects and extracts the MRZ region from passport images.
- Utilizes contour detection to accurately identify the MRZ area.
- Outputs the extracted MRZ region as text for further processing or analysis.
- Detects and extracts the MRZ region from document images
- Contour detection to accurately identify the MRZ area
- Custom-trained models for TensorFlow and Tesseract
- Includes checksum logic for data validation
- Outputs the extracted MRZ data as text or JSON for further processing or analysis


## Built With

![NumPy](https://img.shields.io/badge/numpy-%23013243.svg?style=for-the-badge&logo=numpy&logoColor=white)

![Tensorflow](https://img.shields.io/badge/TensorFlow-FF6F00?style=for-the-badge&logo=tensorflow&logoColor=white)

![OpenCV](https://img.shields.io/badge/OpenCV-27338e?style=for-the-badge&logo=OpenCV&logoColor=white)

![Tesseract OCR](https://img.shields.io/badge/Tesseract%20OCR-0F9D58?style=for-the-badge&logo=google&logoColor=white)

## Installation

1. Install `fastmrz` from pip
```console
```Console
$ pip install fastmrz
---> 100%
Expand All @@ -41,24 +54,37 @@ fast_mrz = FastMRZ()
# fast_mrz = FastMRZ(tesseract_path=r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract') # Default path in Mac
# fast_mrz = FastMRZ(tesseract_path=r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe') # Default path in Windows
passport_mrz = fast_mrz.get_mrz(os.path.abspath('../data/passport_uk.jpg'))
print("JSON:")
print(json.dumps(passport_mrz, indent=4))

print("\n")

passport_mrz = fast_mrz.get_raw_mrz(os.path.abspath('../data/passport_uk.jpg'))
print("TEXT:")
print(passport_mrz)
```

**OUTPUT:**
```Python
```Console
JSON:
{
"mrz_type": "TD3",
"document_type": "P",
"country_code": "GBR",
"surname": "PUDARSAN",
"given_name": "HENERT",
"document_number": "707797979",
"nationality": "GBR",
"date_of_birth": "1995-05-20",
"sex": "M",
"date_of_expiry": "2017-04-22",
"mrz_type": "TD3",
"document_type": "P",
"country_code": "GBR",
"surname": "PUDARSAN",
"given_name": "HENERT",
"document_number": "707797979",
"nationality": "GBR",
"date_of_birth": "1995-05-20",
"sex": "M",
"date_of_expiry": "2017-04-22",
"status": "SUCCESS"
}


TEXT:
P<GBRPUDARSAN<<HENERT<<<<<<<<<<<<<<<<<<<<<<<
7077979792GBR9505209M1704224<<<<<<<<<<<<<<00
```

## MRZ Wiki
Expand All @@ -82,6 +108,10 @@ Now, based on the example of a national passport, let us take a closer look at t

![MRZ GIF](https://raw.githubusercontent.com/sivakumar-mahalingam/fastmrz/main/docs/mrz.gif)

## ToDo

- [ ] Test for MRV-A and MRV-B (visa) documents

## License

Distributed under the AGPL-3.0 License. See `LICENSE` for more information.
Expand Down
9 changes: 6 additions & 3 deletions fastmrz/fastmrz.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,10 @@ def _parse_mrz(self, mrz_text):

mrz_code_dict = {}
if len(mrz_lines) == 2:
# add optional data field
mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_lines[0]) == 36 else 'TD3'

# Line 1
mrz_code_dict['document_type'] = mrz_lines[0][:1]
mrz_code_dict['document_type'] = mrz_lines[0][:2].strip('<')
mrz_code_dict['country_code'] = mrz_lines[0][2:5]
names = mrz_lines[0][5:].split('<<')
mrz_code_dict['surname'] = names[0].replace('<', ' ')
Expand All @@ -142,6 +141,10 @@ def _parse_mrz(self, mrz_text):
if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][27]:
return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'}
mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry'])
if mrz_code_dict['mrz_type'] == 'TD3':
mrz_code_dict['optional_data'] = mrz_lines[1][28:35].strip('<')

mrz_code_dict['optional_data'] = mrz_lines[1][28:35].strip('<') if mrz_code_dict['mrz_type'] == 'TD2' else mrz_lines[1][28:42].strip('<')
if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines[1], mrz_code_dict['mrz_type']):
return {'status': 'FAILURE', 'message': 'final checksum is not matching'}

Expand All @@ -151,7 +154,7 @@ def _parse_mrz(self, mrz_text):
mrz_code_dict['mrz_type'] = 'TD1'

# Line 1
mrz_code_dict['document_type'] = mrz_lines[0][:2].replace('<', ' ')
mrz_code_dict['document_type'] = mrz_lines[0][:2].strip('<')
mrz_code_dict['country_code'] = mrz_lines[0][2:5]
mrz_code_dict['document_number'] = mrz_lines[0][5:14]
if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[0][14]:
Expand Down
7 changes: 7 additions & 0 deletions fastmrz/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,12 @@
# fast_mrz = FastMRZ(tesseract_path=r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract') # Default path in Mac
# fast_mrz = FastMRZ(tesseract_path=r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe') # Default path in Windows
passport_mrz = fast_mrz.get_mrz(os.path.abspath('../data/passport_uk.jpg'))
print("JSON:")
print(json.dumps(passport_mrz, indent=4))

print("\n")

passport_mrz = fast_mrz.get_raw_mrz(os.path.abspath('../data/passport_uk.jpg'))
print("TEXT:")
print(passport_mrz)

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name='fastmrz',
version='0.0.1',
version='1.0',
author='Sivakumar Mahalingam',
description='Extracts the Machine Readable Zone (MRZ) data from document images',
long_description=long_description,
Expand Down

0 comments on commit a9dd2cb

Please sign in to comment.