How to Build a Multilingual OCR AI Agent in Python with EasyOCR and OpenCV


class AdvancedOCRAgent:
    """
    OCR agent built on EasyOCR and OpenCV.

    Provides image preprocessing, confidence-filtered text extraction,
    result visualization, lightweight regex-based text analysis, batch
    processing of a folder, and export of results to JSON or plain text.
    """

    def __init__(self, languages: Optional[List[str]] = None, gpu: bool = True,
                 confidence_threshold: float = 0.5):
        """Initialize the OCR agent.

        Args:
            languages: Language codes passed to ``easyocr.Reader``.
                Defaults to ``['en']`` when omitted.
            gpu: Forwarded to ``easyocr.Reader`` as its ``gpu`` argument.
            confidence_threshold: Minimum confidence a detection needs to be
                kept in ``filtered_results`` by ``extract_text``.
        """
        print("🤖 Initializing Advanced OCR Agent...")
        # A None sentinel avoids sharing one mutable default list between
        # every instance created without an explicit ``languages``.
        self.languages = ['en'] if languages is None else list(languages)
        self.reader = easyocr.Reader(self.languages, gpu=gpu)
        self.confidence_threshold = confidence_threshold
        print(f"✅ OCR Agent ready! Languages: {self.languages}")

    def upload_image(self) -> Optional[str]:
        """Prompt for a file upload through ``files.upload()``.

        Returns:
            The name of the first uploaded file, or ``None`` if nothing
            was uploaded.
        """
        print("📁 Upload your image file:")
        uploaded = files.upload()
        if not uploaded:
            return None

        filename = next(iter(uploaded))
        print(f"✅ Uploaded: {filename}")
        return filename

    def preprocess_image(self, image: np.ndarray, enhance: bool = True) -> np.ndarray:
        """Convert an image to a binarized grayscale image for OCR.

        Args:
            image: Grayscale (2-D) or color (3-D) image array. Color input is
                treated as BGR, or BGRA when it has four channels.
            enhance: When true, apply CLAHE contrast equalization, non-local
                means denoising and a 3x3 sharpening filter before thresholding.

        Returns:
            The adaptively thresholded (binary) image.
        """
        if image.ndim == 3:
            # A 4-channel image (BGR plus alpha) needs its own conversion
            # code; COLOR_BGR2GRAY rejects it.
            conversion = cv2.COLOR_BGRA2GRAY if image.shape[2] == 4 else cv2.COLOR_BGR2GRAY
            gray = cv2.cvtColor(image, conversion)
        else:
            gray = image.copy()

        if enhance:
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            gray = clahe.apply(gray)

            gray = cv2.fastNlMeansDenoising(gray)

            # Sharpening kernel: the weights sum to 1, so overall brightness
            # is preserved while edges are emphasized.
            kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
            gray = cv2.filter2D(gray, -1, kernel)

        # Gaussian-weighted adaptive threshold, 11x11 neighbourhood, offset 2.
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        return binary

    def extract_text(self, image_path: str, preprocess: bool = True) -> Dict:
        """Run OCR on an image file and summarize the detections.

        Args:
            image_path: Path of the image to read with ``cv2.imread``.
            preprocess: When true, run ``preprocess_image`` before OCR.

        Returns:
            A dict with keys ``raw_results`` (the reader output as returned),
            ``filtered_results`` (detections at or above the confidence
            threshold), ``full_text`` (kept texts joined by spaces),
            ``confidence_stats`` (mean/min/max/std of kept confidences, or an
            empty dict when nothing was kept), ``word_count`` and
            ``line_count`` (number of kept detections).

        Raises:
            ValueError: If the image cannot be loaded.
        """
        print(f"🔍 Processing image: {image_path}")

        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not load image: {image_path}")

        processed_image = self.preprocess_image(image) if preprocess else image

        results = self.reader.readtext(processed_image)

        filtered_results = [
            {'text': text, 'confidence': confidence, 'bbox': bbox}
            for (bbox, text, confidence) in results
            if confidence >= self.confidence_threshold
        ]
        high_confidence_text = [item['text'] for item in filtered_results]
        confidences = [item['confidence'] for item in filtered_results]

        full_text = ' '.join(high_confidence_text)

        confidence_stats = {}
        if confidences:
            # Plain floats keep the summary independent of NumPy scalar types.
            confidence_stats = {
                'mean': float(np.mean(confidences)),
                'min': float(np.min(confidences)),
                'max': float(np.max(confidences)),
                'std': float(np.std(confidences)),
            }

        return {
            'raw_results': results,
            'filtered_results': filtered_results,
            'full_text': full_text,
            'confidence_stats': confidence_stats,
            'word_count': len(full_text.split()),
            'line_count': len(high_confidence_text),
        }

    def visualize_results(self, image_path: str, results: Dict, show_bbox: bool = True):
        """Plot OCR results in a 2x2 matplotlib figure.

        Panels: detections drawn on the image (only when ``show_bbox``), the
        preprocessed image, a histogram of kept confidences, and a bar chart
        of mean/min/max confidence.

        Args:
            image_path: Path of the image that was processed.
            results: A result dict as returned by ``extract_text``.
            show_bbox: Whether to draw the bounding-box panel.

        Raises:
            ValueError: If the image cannot be loaded.
        """
        image = cv2.imread(image_path)
        if image is None:
            # Same failure mode as extract_text, instead of an opaque
            # cv2.error raised from cvtColor on a None image.
            raise ValueError(f"Could not load image: {image_path}")
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.figure(figsize=(15, 10))

        if show_bbox:
            plt.subplot(2, 2, 1)
            img_with_boxes = image_rgb.copy()

            for item in results['filtered_results']:
                # Use an explicit 32-bit integer array for the polygon points.
                bbox = np.array(item['bbox']).astype(np.int32)
                cv2.polylines(img_with_boxes, [bbox], True, (255, 0, 0), 2)

                # Plain Python ints for the text origin.
                x, y = (int(v) for v in bbox[0])
                cv2.putText(img_with_boxes, f"{item['confidence']:.2f}",
                            (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

            plt.imshow(img_with_boxes)
            plt.title("OCR Results with Bounding Boxes")
            plt.axis('off')

        plt.subplot(2, 2, 2)
        processed = self.preprocess_image(image)
        plt.imshow(processed, cmap='gray')
        plt.title("Preprocessed Image")
        plt.axis('off')

        plt.subplot(2, 2, 3)
        confidences = [item['confidence'] for item in results['filtered_results']]
        if confidences:
            plt.hist(confidences, bins=20, alpha=0.7, color="blue")
            plt.xlabel('Confidence Score')
            plt.ylabel('Frequency')
            plt.title('Confidence Score Distribution')
            plt.axvline(self.confidence_threshold, color="red", linestyle="--",
                        label=f'Threshold: {self.confidence_threshold}')
            plt.legend()

        plt.subplot(2, 2, 4)
        stats = results['confidence_stats']
        if stats:
            labels = ['Mean', 'Min', 'Max']
            values = [stats['mean'], stats['min'], stats['max']]
            plt.bar(labels, values, color=['green', 'red', 'blue'])
            plt.ylabel('Confidence Score')
            plt.title('Confidence Statistics')
            plt.ylim(0, 1)

        plt.tight_layout()
        plt.show()

    def smart_text_analysis(self, text: str) -> Dict:
        """Extract simple patterns from text and guess its type and script.

        Args:
            text: The text to analyze.

        Returns:
            A dict with keys ``language_detection`` (a coarse label derived
            from the character ranges present), ``text_type``, ``key_info``
            (always an empty dict here) and ``patterns`` (a dict holding the
            non-empty match lists among ``emails``, ``phones``, ``urls`` and
            ``dates``).
        """
        analysis = {
            'language_detection': 'unknown',
            'text_type': 'unknown',
            'key_info': {},
            'patterns': {}
        }

        # TLD class is letters only; a '|' inside a character class is a
        # literal pipe, not alternation.
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        # The optional country code is a NON-capturing group: with a capturing
        # group re.findall would return only that group instead of the number.
        phone_pattern = r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'

        patterns = {
            'emails': re.findall(email_pattern, text, re.IGNORECASE),
            'phones': re.findall(phone_pattern, text),
            'urls': re.findall(url_pattern, text, re.IGNORECASE),
            'dates': re.findall(date_pattern, text)
        }

        analysis['patterns'] = {k: v for k, v in patterns.items() if v}

        # Contact details take precedence over URLs, which take precedence
        # over dates.
        if patterns['emails'] or patterns['phones']:
            analysis['text_type'] = 'contact_info'
        elif patterns['urls']:
            analysis['text_type'] = 'web_content'
        elif patterns['dates']:
            analysis['text_type'] = 'document_with_dates'

        lowered = text.lower()
        if re.search(r'[а-яё]', lowered):
            analysis['language_detection'] = 'russian'
        elif re.search(r'[àáâãäåæçèéêëìíîïñòóôõöøùúûüý]', lowered):
            analysis['language_detection'] = 'romance_language'
        elif re.search(r'[\u3040-\u30ff]', text):
            # Hiragana/katakana ranges. Checked before CJK ideographs because
            # Japanese text usually mixes kana with kanji, which would
            # otherwise be labelled as Chinese.
            analysis['language_detection'] = 'japanese'
        elif re.search(r'[一-龯]', text):
            analysis['language_detection'] = 'chinese'
        elif re.search(r'[a-zA-Z]', text):
            analysis['language_detection'] = 'latin_based'

        return analysis

    def process_batch(self, image_folder: str) -> List[Dict]:
        """Run ``extract_text`` on every supported image in a folder.

        Files are visited in sorted name order so output is deterministic.
        A failure on one file is reported and does not stop the batch.

        Args:
            image_folder: Directory to scan (not recursive).

        Returns:
            One result dict per successfully processed image, each with an
            added ``filename`` key.
        """
        results = []
        supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

        for filename in sorted(os.listdir(image_folder)):
            if not filename.lower().endswith(supported_formats):
                continue

            image_path = os.path.join(image_folder, filename)
            try:
                result = self.extract_text(image_path)
            except Exception as e:
                # Deliberate best-effort: report and continue with the rest.
                print(f"❌ Error processing {filename}: {str(e)}")
            else:
                result['filename'] = filename
                results.append(result)
                print(f"✅ Processed: {filename}")

        return results

    @staticmethod
    def _json_default(value):
        """Convert NumPy scalars and arrays to JSON-native Python values."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    def export_results(self, results: Dict, format: str="json") -> str:
        """Write results to a file in the current working directory.

        Args:
            results: A result dict from ``extract_text``. A list of such
                dicts (as returned by ``process_batch``) is also accepted.
            format: ``'json'`` writes ``ocr_results.json``; ``'txt'`` writes
                the extracted text to ``extracted_text.txt`` (one line per
                result when a list is given). Case-insensitive.

        Returns:
            The name of the file written.

        Raises:
            ValueError: If ``format`` is not ``'json'`` or ``'txt'``.
        """
        fmt = format.lower()
        if fmt == 'json':
            # OCR output may contain NumPy integers/arrays (e.g. box
            # coordinates), which the json module cannot encode natively.
            output = json.dumps(results, indent=2, ensure_ascii=False,
                                default=self._json_default)
            filename = "ocr_results.json"
        elif fmt == 'txt':
            if isinstance(results, list):
                output = '\n'.join(item['full_text'] for item in results)
            else:
                output = results['full_text']
            filename = "extracted_text.txt"
        else:
            raise ValueError("Supported formats: 'json', 'txt'")

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(output)

        print(f"📄 Results exported to: {filename}")
        return filename



Source link

  • Related Posts

    Google’s Sensible Agent Reframes Augmented Reality (AR) Assistance as a Coupled “what+how” Decision—So What Does That Change?

    Sensible Agent is an AI research framework and prototype from Google that chooses both the action an augmented reality (AR) agent should take and the interaction modality to deliver/confirm it,…

    Top Computer Vision (CV) Blogs & News Websites (2025)

    Computer vision moved fast in 2025: new multimodal backbones, larger open datasets, and tighter model–systems integration. Practitioners need sources that publish rigorously, link code and benchmarks, and track deployment patterns—not…

    Leave a Reply

    Your email address will not be published. Required fields are marked *