
    6Fi
                     h    d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dl	Z	d dl
Z
 G d d          ZdS )    N)extract_text)convert_from_pathc                        e Zd Zd Zd Zd ZdS )PdfExtractorc                 B    t          j        t                    | _        d S )N)logging	getLogger__name__logger)selfs    BD:\xampp\htdocs\new-grp\engine\services\extractor\pdf_extractor.py__init__zPdfExtractor.__init__   s    '11    c                 V   	 t          j        t          j                            |                                        d          }| j                            d|            d}	 t          j	        |          }|D ]}||
                    d          z  }|                                 |                                r|                     |          S | j                            d           nA# t          $ r4}| j                            dt!          |                      Y d }~nd }~ww xY w	 t#          |          }|                                r|                     |          S | j                            d           nA# t          $ r4}| j                            dt!          |                      Y d }~nd }~ww xY w	 t%          |          }|D ]}|t'          j        |d	
          z  }|                     |          S # t          $ r4}| j                            dt!          |                      Y d }~nd }~ww xY w|                                st+          d          |S # t          $ r0}| j                            dt!          |                       d }~ww xY w)Nutf-8zExtracting text from PDF:  textz.No text found with PyMuPDF, trying PDFMiner...zPyMuPDF extraction failed: z*No text found with PDFMiner, trying OCR...zPDFMiner extraction failed: eng)langzOCR extraction failed: z'No text could be extracted from the PDFzPDF extraction failed: )base64	b64decodeurllibparseunquotedecoder   infofitzopenget_textclosestrip_detect_encoding	Exceptionerrorstrr   r   pytesseractimage_to_string
ValueError)	r   encoded_pathpdf_pathextracted_textdocpageeimagesimgs	            r   r   zPdfExtractor.extract_text   s   -	'(<(<\(J(JKKRRSZ[[HKD(DDEEEN
Ji)) < <D"dmmF&;&;;NN		!'')) A00@@@  !QRRRR J J J!!"HA"H"HIIIIIIIIJK!-h!7!7!'')) A00@@@  !MNNNN K K K!!"IQ"I"IJJJJJJJJKF*844! S SC"k&A#E&R&R&RRNN,,^<<< F F F!!"DCFF"D"DEEEEEEEEF "'')) L !JKKK!! 	 	 	K@A@@AAA	s   A#I. &A-C/ C/ .I. /
D-9*D(#I. (D--I. 17F )F I. 
G*F=8I. =GI. AH 
I*I<I. I'I. .
J(8+J##J(c                 B   	 t          j        |                                          }|d         }|r*|                    |                              dd          S |S # t          $ r6}| j                            dt          |                      |cY d}~S d}~ww xY w)z*Detect encoding and convert text to UTF-8.encodingr   ignore)errorszEncoding detection failed: N)chardetdetectencoder   r#   r   r$   r%   )r   r   resultr2   r.   s        r   r"   zPdfExtractor._detect_encoding>   s    	^DKKMM22Fj)H N{{8,,33GH3MMMK 	 	 	KDCFFDDEEEKKKKKK	s$   AA A 
B(+BBBN)r
   
__module____qualname__r   r   r"    r   r   r   r   
   sB        2 2 2. . .`
 
 
 
 
r   r   )r   urllib.parser   r   pdfminer.high_levelr   	pdf2imager   r&   r5   r   r   r;   r   r   <module>r?      s          , , , , , , ' ' ' ' ' '      > > > > > > > > > >r   