# ocr_extract.py

import pytesseract
from dotenv import load_dotenv
import os
from PIL import Image


class OCRExtractor:
    """Read images, run Tesseract OCR on them, and save the extracted text.

    Every method is best-effort: failures are reported with ``print`` and
    signalled through the return value (``None`` / ``""``) instead of being
    raised to the caller.
    """

    def __init__(self):
        """Load ``.env`` and configure the Tesseract executable path.

        ``TESSERACT_PATH`` is read from the environment after
        ``load_dotenv()`` and stored on ``self.tesseract_path`` (``None``
        when the variable is not set).

        Note: ``pytesseract.pytesseract.tesseract_cmd`` is module-level
        state, so setting it here affects every user of pytesseract in the
        process.
        """
        load_dotenv()
        self.tesseract_path = os.getenv("TESSERACT_PATH")
        # Only override the command when a path is actually configured.
        # Assigning None (or an empty string) would discard pytesseract's
        # own default command and make every OCR call fail, even when the
        # tesseract binary is available on PATH.
        if self.tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = self.tesseract_path

    def read_image(self, image_path):
        """Open the image at *image_path*.

        Returns:
            The object returned by ``Image.open``, or ``None`` if opening
            failed (the error is printed).
        """
        try:
            return Image.open(image_path)
        except Exception as e:
            print(f"Error reading image: {e}")
            return None

    def extract_text_from_image(self, image, lang="eng"):
        """Run OCR on *image* and return the recognised text.

        Args:
            image: Image to process, e.g. the result of ``read_image``.
            lang: Language code passed to ``pytesseract.image_to_string``.
                Defaults to ``"eng"``.

        Returns:
            The extracted text, or ``""`` if *image* is ``None`` or OCR
            failed (the error is printed).
        """
        # read_image() returns None on failure; report that explicitly
        # rather than letting it surface as an opaque OCR error.
        if image is None:
            print("Error extracting text: no image provided")
            return ""
        try:
            return pytesseract.image_to_string(image, lang=lang)
        except Exception as e:
            print(f"Error extracting text: {e}")
            return ""

    def save_text_to_file(self, text, output_path):
        """Write *text* to *output_path* as UTF-8, overwriting any existing file.

        Prints a success message after the file has been written, or an
        error message if writing failed. Returns ``None`` in both cases.
        """
        try:
            with open(output_path, "w", encoding="utf-8") as file:
                file.write(text)
        except Exception as e:
            print(f"Error saving text to file: {e}")
        else:
            # Reached only when the write (and the implicit close) succeeded.
            print("Successfully extracted data")