-
-
Save bkaankuguoglu/111f9f5e0c30b5f57d7c5338d6dcb6fc to your computer and use it in GitHub Desktop.
| #=======================================================================# | |
| # extract_data.py # | |
| #=======================================================================# | |
| # usage: extract_data.py [-h] [-i INPUT_DIR] [-o OUTPUT_DIR] | |
| # | |
| # This program extracts provision numbers from a set of documents. | |
| # | |
| # optional arguments: | |
| # -h, --help show this help message and exit | |
| # -i INPUT_DIR, --input_dir INPUT_DIR | |
| # Input directory for the files to be modified | |
| # -o OUTPUT_DIR, --output_dir OUTPUT_DIR | |
| # Output directory for the files to be modified | |
| #=======================================================================# | |
| #=======================================================================# | |
| # Sample usage: # | |
| #=======================================================================# | |
| # python extract_data.py --input_dir ocr/data/ --output_dir ocr/results/ | |
| #=======================================================================# | |
| import numpy as np | |
| import os | |
| import cv2 | |
| import glob | |
| import shutil | |
| import pytesseract | |
| import re | |
| import time | |
| import argparse | |
| from statistics import mode | |
# A provision number is the letter "P" followed by exactly 17 digits.
regex = r"P\d{17}"

# found:   file name -> every provision number matched across the filter methods
# results: file name -> the provision number finally chosen for that file
found, results = {}, {}

# queue:   image paths still waiting to be processed
# done:    file names for which a provision number was extracted
# missing: file names for which no filter method produced a match
queue, done, missing = [], [], []

# Region of the page that is cropped before OCR, in pixels.
pnr_area = [150, 450, 1600, 1150]  # [start_x, start_y, end_x, end_y]
| # =============================================================================== # | |
| # To-do list # | |
| # =============================================================================== # | |
| # 0. Provision Number # | |
| # =============================================================================== # | |
| # =============================================================================== # | |
| # Threshold Methods # | |
| # =============================================================================== # | |
| # 1. Binary-Otsu w/ Gaussian Blur (kernel size = 9) # | |
| # 2. Binary-Otsu w/ Gaussian Blur (kernel size = 7) # | |
| # 3. Binary-Otsu w/ Gaussian Blur (kernel size = 5) # | |
| # 4. Binary-Otsu w/ Median Blur (kernel size = 5) # | |
| # 5. Binary-Otsu w/ Median Blur (kernel size = 3) # | |
| # 6. Adaptive Gaussian Threshold (31,2) w/ Gaussian Blur (kernel size = 5) # | |
| # 7. Adaptive Gaussian Threshold (31,2) w/ Median Blur (kernel size = 3) # | |
| # =============================================================================== # | |
def apply_threshold(img, argument):
    """Binarize ``img`` with one of seven blur + threshold recipes.

    Only the selected recipe is executed; an unknown method number touches
    neither ``img`` nor OpenCV.

    Args:
        img: Grayscale image, as produced by the BGR-to-gray conversion in
            ``get_string``.
        argument: Recipe number:
            1. Binary-Otsu after Gaussian blur, 9x9 kernel
            2. Binary-Otsu after Gaussian blur, 7x7 kernel
            3. Binary-Otsu after Gaussian blur, 5x5 kernel
            4. Binary-Otsu after median blur, kernel size 5
            5. Binary-Otsu after median blur, kernel size 3
            6. Adaptive Gaussian threshold (31, 2) after Gaussian blur, 5x5
            7. Adaptive Gaussian threshold (31, 2) after median blur, size 3

    Returns:
        The thresholded image, or the string ``"Invalid method"`` when
        ``argument`` is not one of the recipe numbers above.
    """
    def otsu(blurred):
        # cv2.threshold returns (threshold_value, image); keep the image.
        return cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    def adaptive(blurred):
        return cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 31, 2)

    # Values are callables so that looking a recipe up does not run it.
    switcher = {
        1: lambda: otsu(cv2.GaussianBlur(img, (9, 9), 0)),
        2: lambda: otsu(cv2.GaussianBlur(img, (7, 7), 0)),
        3: lambda: otsu(cv2.GaussianBlur(img, (5, 5), 0)),
        4: lambda: otsu(cv2.medianBlur(img, 5)),
        5: lambda: otsu(cv2.medianBlur(img, 3)),
        6: lambda: adaptive(cv2.GaussianBlur(img, (5, 5), 0)),
        7: lambda: adaptive(cv2.medianBlur(img, 3)),
    }
    recipe = switcher.get(argument)
    if recipe is None:
        return "Invalid method"
    return recipe()
def crop_image(img, start_x, start_y, end_x, end_y):
    """Return the part of ``img`` spanning rows start_y..end_y and columns start_x..end_x.

    The end coordinates are exclusive. Rows are indexed first, so the y range
    comes before the x range in the subscript.
    """
    rows = slice(start_y, end_y)
    cols = slice(start_x, end_x)
    return img[rows, cols]
def get_string(img_path, method):
    """Crop, filter and OCR one image, returning the recognized text.

    The image is cropped to ``pnr_area``, converted to grayscale, passed
    through ``apply_threshold`` with the given ``method``, saved as
    ``<output_dir>/<name>/<name>_filter_<method>.jpg`` and finally handed to
    Tesseract. ``output_dir`` is the module-level name set by the script's
    entry point.

    Args:
        img_path: Path of the image file to read.
        method: Filter method number forwarded to ``apply_threshold``.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        ValueError: If the image cannot be read, or if the ``pnr_area`` crop
            region lies outside the image. Without these checks an empty
            crop only fails later, inside ``cv2.cvtColor``, with an opaque
            ``!_src.empty()`` assertion.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(img_path)
    if img is None:
        raise ValueError("Could not read image: {}".format(img_path))

    # Name of the per-image output folder: the base name up to the first dot,
    # then up to the first whitespace.
    file_name = os.path.basename(img_path).split('.')[0]
    file_name = file_name.split()[0]
    output_path = os.path.join(output_dir, file_name)
    os.makedirs(output_path, exist_ok=True)

    # Crop the area where the provision number is most likely present.
    height, width = img.shape[:2]
    img = crop_image(img, pnr_area[0], pnr_area[1], pnr_area[2], pnr_area[3])
    if img.size == 0:
        raise ValueError(
            "Crop region {} lies outside image {} ({}x{} pixels); "
            "adjust pnr_area to fit the image".format(pnr_area, img_path, width, height))

    # Convert to gray.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Dilation followed by erosion, meant to remove some noise.
    # NOTE(review): with a 1x1 kernel these two steps should leave the image
    # unchanged; a larger kernel was probably intended - confirm.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    # Apply threshold to get an image with only black and white.
    img = apply_threshold(img, method)

    # Keep the filtered image next to the OCR output for inspection.
    save_path = os.path.join(output_path, file_name + "_filter_" + str(method) + ".jpg")
    cv2.imwrite(save_path, img)

    # Recognize text with Tesseract.
    return pytesseract.image_to_string(img, lang="eng")
def find_match(regex, text):
    """Report every occurrence of ``regex`` in ``text`` and return the last one.

    Each occurrence is printed with its 1-based ordinal and its start/end
    offsets. When the pattern does not occur at all, nothing is printed and
    an empty string is returned.
    """
    last_hit = ""
    for ordinal, hit in enumerate(re.finditer(regex, text, re.MULTILINE), start=1):
        last_hit = hit.group()
        print(" Match {matchNum} was found at {start}-{end}: {match}".format(
            matchNum=ordinal, start=hit.start(), end=hit.end(), match=last_hit))
    return last_hit
def pretty_print(result_dict):
    """Format ``result_dict`` as one ``# key: value`` line per entry.

    Every line ends with a newline; an empty mapping gives an empty string.
    Keys and values are concatenated as-is, so both must be strings.
    """
    return ''.join(
        '# ' + name + ': ' + result_dict[name] + '\n'
        for name in result_dict
    )
if __name__ == '__main__':
    # Script entry point: run every filter method on every image in the input
    # directory, collect the provision numbers that match `regex`, and print a
    # per-file result followed by an overall summary.

    # Local import: Counter picks the most frequent match without the
    # StatisticsError that statistics.mode raises on ties before Python 3.8.
    from collections import Counter

    parser = argparse.ArgumentParser(description="This program extracts provision numbers from a set of documents.")
    parser.add_argument("-i", "--input_dir", help="Input directory for the files to be modified")
    parser.add_argument("-o", "--output_dir", help="Output directory for the files to be modified")
    args = parser.parse_args()
    if not args.input_dir or not args.output_dir:
        # Fail with a usage message instead of a TypeError on None further down.
        parser.error("both --input_dir and --output_dir are required")
    input_dir = args.input_dir
    output_dir = args.output_dir

    # The output directory is wiped below; refuse to do that when it is the
    # input directory itself or one of its parents, which would delete the inputs.
    real_in = os.path.realpath(input_dir)
    real_out = os.path.realpath(output_dir)
    if real_in == real_out or real_in.startswith(real_out.rstrip(os.sep) + os.sep):
        parser.error("--output_dir must not be the input directory or one of its parents")

    # Start from an empty output directory.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Sorted once so that the printed queue and the processing order agree.
    im_names = sorted(glob.glob(os.path.join(input_dir, '*.png')) +
                      glob.glob(os.path.join(input_dir, '*.jpg')) +
                      glob.glob(os.path.join(input_dir, '*.jpeg')))

    overall_start_t = time.time()
    queue.extend(im_names)
    print("The following files will be processed and their provision numbers will be extracted: {}\n".format(queue))

    for im_name in im_names:
        start_time = time.time()
        print("*** The documents that are in the queue *** \n{}\n".format(queue))
        print('#=======================================================')
        print(('# Regex is being applied on {:s}'.format(im_name)))
        print('#=======================================================')
        queue.remove(im_name)

        # Derived exactly as in get_string, so the .txt files land in the
        # folder that get_string creates for this image.
        file_name = os.path.basename(im_name).split('.')[0].split()[0]

        for method in range(1, 8):
            print("> The filter method " + str(method) + " is now being applied.")
            try:
                result = get_string(im_name, method)
            except (cv2.error, ValueError) as err:
                # One unreadable or too-small image must not abort the batch;
                # if no method succeeds the file is reported as missing.
                print("  Filter method {} failed: {}".format(method, err))
                continue

            match = find_match(regex, result)
            if match:
                found.setdefault(file_name, []).append(match)

            # Keep the raw OCR text of every method for inspection.
            txt_path = os.path.join(output_dir, file_name, file_name + "_filter_" + str(method) + ".txt")
            with open(txt_path, 'w', encoding="utf-8") as f:
                f.write(result)

        pnr = ''
        if file_name in found:
            # Majority vote across the filter methods; ties go to the value
            # that was seen first.
            pnr = Counter(found[file_name]).most_common(1)[0][0]
            results[file_name] = pnr
            done.append(file_name)
        else:
            missing.append(file_name)
        end_time = time.time()

        print('#=======================================================\n'
              '# Results for: ' + file_name + '\n'
              '#=======================================================\n'
              '# The provision number: ' + pnr + '\n'
              '# It took ' + str(end_time-start_time) + ' seconds. \n'
              '#=======================================================\n')

    overall_end_t = time.time()
    print('#=======================================================\n'
          '# Summary \n'
          '#=======================================================\n'
          '# The documents that are successfully processed are: \n' + pretty_print(results) +
          '#=======================================================\n'
          '# The program failed to extract information from: \n'
          '# ' + str(missing) + '\n'
          '#=======================================================\n'
          '# It took ' + str(overall_end_t-overall_start_t) + ' seconds.\n'
          '#=======================================================\n')
I am getting this error in the terminal:
The following files will be processed and their provision numbers will be extracted: ['images/car_wash.png', 'images/lebron_james.jpg', 'images/sign.jpg']
*** The documents that are in the queue ***
['images/car_wash.png', 'images/lebron_james.jpg', 'images/sign.jpg']
#=======================================================
Regex is being applied on images/car_wash.png
#=======================================================
The filter method 1 is now being applied.
Traceback (most recent call last):
File "scale.py", line 203, in <module>
result = get_string(im_name, i)
File "scale.py", line 114, in get_string
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
cv2.error: OpenCV(4.0.0) /Users/travis/build/skvark/opencv-python/opencv/modules/imgproc/src/color.cpp:181: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'
@ParfectShot comment out line 91 img = crop_image(img, pnr_area[0], pnr_area[1], pnr_area[2], pnr_area[3])
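Better yet, you might want to change the pnr_area to match the area of the image you want to recognize. The default pnr_area starts at x=150, y=450; if your image is smaller than that, the crop comes back empty, and cvtColor then fails with the "!_src.empty()" assertion shown above.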
But how does that resolve the issue?