package com.adobe.internal.pdftoolkit.services.readingorder;

import com.adobe.internal.pdftoolkit.core.exceptions.PDFFontException;
import com.adobe.internal.pdftoolkit.core.exceptions.PDFIOException;
import com.adobe.internal.pdftoolkit.core.exceptions.PDFInvalidDocumentException;
import com.adobe.internal.pdftoolkit.core.exceptions.PDFSecurityException;
import com.adobe.internal.pdftoolkit.core.fontset.PDFFontSet;
import com.adobe.internal.pdftoolkit.core.types.ASCoordinate;
import com.adobe.internal.pdftoolkit.core.types.ASQuad;
import com.adobe.internal.pdftoolkit.pdf.document.PDFDocument;
import com.adobe.internal.pdftoolkit.pdf.page.PDFPage;
import com.adobe.internal.pdftoolkit.services.readingorder.impl.FindHVBreaks;
import com.adobe.internal.pdftoolkit.services.readingorder.impl.SortedWord;
import com.adobe.internal.pdftoolkit.services.textextraction.ParagraphIterator;
import com.adobe.internal.pdftoolkit.services.textextraction.SentenceIterator;
import com.adobe.internal.pdftoolkit.services.textextraction.TextExtractionOptions;
import com.adobe.internal.pdftoolkit.services.textextraction.TextExtractor;
import com.adobe.internal.pdftoolkit.services.textextraction.Word;
import com.adobe.internal.pdftoolkit.services.textextraction.WordsIterator;
import com.adobe.internal.pdftoolkit.services.textextraction.impl.Base14FontSetUtil;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

/* loaded from: input_file:com/adobe/internal/pdftoolkit/services/readingorder/ReadingOrderTextExtractor.class */
public class ReadingOrderTextExtractor {
    private PDFDocument pdfDoc;
    private PDFFontSet fontSet;
    private boolean useStructureInfo;
    private TextExtractionOptions teOptions;
    private static final int trace = -1;
    private HashMap<Double, Integer> sFreq = new HashMap<>();
    TreeMap<Double, SortedWord> vLine = new TreeMap<>();
    TreeMap<Double, SortedWord> hLine = new TreeMap<>();
    List<List<List<List<Word>>>> wordsInDocument = new ArrayList();
    private boolean resolveHyphenation = false;

    /* loaded from: input_file:com/adobe/internal/pdftoolkit/services/readingorder/ReadingOrderTextExtractor$DocumentParagraphIterator.class */
    /**
     * Paragraph iterator spanning every page of the enclosing extractor's document.
     *
     * <p>Pages are pulled lazily from the document's page iterator; each page is turned
     * into a per-page {@link ParagraphIterator} via
     * {@link ReadingOrderTextExtractor#getParagraphIterator(PDFPage, int)}, and pages that
     * yield no paragraphs are skipped.
     */
    class DocumentParagraphIterator implements ParagraphIterator {
        /** Number of pages consumed so far; passed as the 1-based page number of the current page. */
        private int pageIndex = 0;
        Iterator<PDFPage> pagesIter;
        /** Iterator over the current page's paragraphs; {@code null} when the document has no pages. */
        ParagraphIterator wordsIter;

        DocumentParagraphIterator() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            this.pagesIter = ReadingOrderTextExtractor.this.pdfDoc.requirePages().iterator();
            if (this.pagesIter.hasNext()) {
                advanceToNextPage();
            }
        }

        /** Consumes the next page and replaces {@link #wordsIter} with that page's paragraph iterator. */
        private void advanceToNextPage() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            PDFPage page = this.pagesIter.next();
            this.pageIndex++;
            this.wordsIter = ReadingOrderTextExtractor.this.getParagraphIterator(page, this.pageIndex);
        }

        /**
         * Reports whether another paragraph is available, advancing over exhausted or empty
         * pages as needed.
         *
         * @return {@code true} if {@link #next()} will return a paragraph; {@code false} once
         *         all pages are exhausted or when the document has no pages at all
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.ParagraphIterator
        public boolean hasNext() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            // wordsIter is null only for a page-less document; the null checks keep that
            // case from failing with a NullPointerException.
            while ((this.wordsIter == null || !this.wordsIter.hasNext()) && this.pagesIter.hasNext()) {
                advanceToNextPage();
            }
            return this.wordsIter != null && this.wordsIter.hasNext();
        }

        /**
         * Returns the next paragraph.
         *
         * @return the next paragraph, or {@code null} when the iteration is exhausted
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.ParagraphIterator
        public List<List<Word>> next() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            if (hasNext()) {
                return this.wordsIter.next();
            }
            return null;
        }
    }

    /* loaded from: input_file:com/adobe/internal/pdftoolkit/services/readingorder/ReadingOrderTextExtractor$DocumentSentenceIterator.class */
    /**
     * Sentence iterator spanning every page of the enclosing extractor's document.
     *
     * <p>Pages are pulled lazily from the document's page iterator; each page is turned
     * into a per-page {@link SentenceIterator} via
     * {@link ReadingOrderTextExtractor#getSentenceIterator(PDFPage, int)}, and pages that
     * yield no sentences are skipped.
     */
    class DocumentSentenceIterator implements SentenceIterator {
        /** Number of pages consumed so far; passed as the 1-based page number of the current page. */
        private int pageIndex = 0;
        Iterator<PDFPage> pagesIter;
        /** Iterator over the current page's sentences; {@code null} when the document has no pages. */
        SentenceIterator wordsIter;

        DocumentSentenceIterator() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            this.pagesIter = ReadingOrderTextExtractor.this.pdfDoc.requirePages().iterator();
            if (this.pagesIter.hasNext()) {
                advanceToNextPage();
            }
        }

        /** Consumes the next page and replaces {@link #wordsIter} with that page's sentence iterator. */
        private void advanceToNextPage() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            PDFPage page = this.pagesIter.next();
            this.pageIndex++;
            this.wordsIter = ReadingOrderTextExtractor.this.getSentenceIterator(page, this.pageIndex);
        }

        /**
         * Reports whether another sentence is available, advancing over exhausted or empty
         * pages as needed.
         *
         * @return {@code true} if {@link #next()} will return a sentence; {@code false} once
         *         all pages are exhausted or when the document has no pages at all
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.SentenceIterator
        public boolean hasNext() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            // wordsIter is null only for a page-less document; the null checks keep that
            // case from failing with a NullPointerException.
            while ((this.wordsIter == null || !this.wordsIter.hasNext()) && this.pagesIter.hasNext()) {
                advanceToNextPage();
            }
            return this.wordsIter != null && this.wordsIter.hasNext();
        }

        /**
         * Returns the next sentence.
         *
         * @return the next sentence, or {@code null} when the iteration is exhausted
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.SentenceIterator
        public String next() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            if (hasNext()) {
                return this.wordsIter.next();
            }
            return null;
        }
    }

    /* loaded from: input_file:com/adobe/internal/pdftoolkit/services/readingorder/ReadingOrderTextExtractor$DocumentWordsIterator.class */
    /**
     * Word iterator spanning every page of the enclosing extractor's document.
     *
     * <p>Pages are pulled lazily from the document's page iterator; each page is turned
     * into a per-page {@link WordsIterator} via
     * {@link ReadingOrderTextExtractor#getWordsIterator(PDFPage, int)}, and pages that
     * yield no words are skipped.
     */
    class DocumentWordsIterator implements WordsIterator {
        /** Number of pages consumed so far; passed as the 1-based page number of the current page. */
        private int pageIndex = 0;
        Iterator<PDFPage> pagesIter;
        /** Iterator over the current page's words; {@code null} when the document has no pages. */
        WordsIterator wordsIter;

        DocumentWordsIterator() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            this.pagesIter = ReadingOrderTextExtractor.this.pdfDoc.requirePages().iterator();
            if (this.pagesIter.hasNext()) {
                advanceToNextPage();
            }
        }

        /** Consumes the next page and replaces {@link #wordsIter} with that page's word iterator. */
        private void advanceToNextPage() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            PDFPage page = this.pagesIter.next();
            this.pageIndex++;
            this.wordsIter = ReadingOrderTextExtractor.this.getWordsIterator(page, this.pageIndex);
        }

        /**
         * Reports whether another word is available, advancing over exhausted or empty
         * pages as needed.
         *
         * @return {@code true} if {@link #next()} will return a word; {@code false} once
         *         all pages are exhausted or when the document has no pages at all
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.WordsIterator
        public boolean hasNext() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            // wordsIter is null only for a page-less document; the null checks keep that
            // case from failing with a NullPointerException.
            while ((this.wordsIter == null || !this.wordsIter.hasNext()) && this.pagesIter.hasNext()) {
                advanceToNextPage();
            }
            return this.wordsIter != null && this.wordsIter.hasNext();
        }

        /**
         * Returns the next word.
         *
         * @return the next word, or {@code null} when the iteration is exhausted
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.WordsIterator
        public Word next() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            if (hasNext()) {
                return this.wordsIter.next();
            }
            return null;
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:com/adobe/internal/pdftoolkit/services/readingorder/ReadingOrderTextExtractor$ParagraphListIterator.class */
    /**
     * Adapts an in-memory list of paragraphs to the {@link ParagraphIterator} interface
     * by delegating to the list's own {@link Iterator}.
     */
    public static class ParagraphListIterator implements ParagraphIterator {
        /** Backing iterator over the paragraphs supplied at construction. */
        Iterator<List<List<Word>>> wordsIter;

        /**
         * @param list paragraphs to iterate over; must not be {@code null} because
         *             {@code iterator()} is invoked on it immediately
         */
        ParagraphListIterator(List<List<List<Word>>> list) {
            this.wordsIter = list.iterator();
        }

        /** @return {@code true} while the backing list has more paragraphs */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.ParagraphIterator
        public boolean hasNext() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            return this.wordsIter.hasNext();
        }

        /**
         * @return the next paragraph from the backing list
         * @throws java.util.NoSuchElementException if the backing iterator is exhausted
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.ParagraphIterator
        public List<List<Word>> next() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            return this.wordsIter.next();
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:com/adobe/internal/pdftoolkit/services/readingorder/ReadingOrderTextExtractor$SentenceListIterator.class */
    /**
     * Adapts an in-memory list of sentences to the {@link SentenceIterator} interface
     * by delegating to the list's own {@link Iterator}.
     */
    public static class SentenceListIterator implements SentenceIterator {
        /** Backing iterator over the sentences supplied at construction. */
        Iterator<String> wordsIter;

        /**
         * @param list sentences to iterate over; must not be {@code null} because
         *             {@code iterator()} is invoked on it immediately
         */
        SentenceListIterator(List<String> list) {
            this.wordsIter = list.iterator();
        }

        /** @return {@code true} while the backing list has more sentences */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.SentenceIterator
        public boolean hasNext() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            return this.wordsIter.hasNext();
        }

        /**
         * @return the next sentence from the backing list
         * @throws java.util.NoSuchElementException if the backing iterator is exhausted
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.SentenceIterator
        public String next() throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
            return this.wordsIter.next();
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:com/adobe/internal/pdftoolkit/services/readingorder/ReadingOrderTextExtractor$WordListIterator.class */
    /**
     * Adapts an in-memory list of words to the {@link WordsIterator} interface
     * by delegating to the list's own {@link Iterator}.
     */
    public static class WordListIterator implements WordsIterator {
        /** Backing iterator over the words supplied at construction. */
        Iterator<Word> wordsIter;

        /**
         * @param list words to iterate over; must not be {@code null} because
         *             {@code iterator()} is invoked on it immediately
         */
        WordListIterator(List<Word> list) {
            this.wordsIter = list.iterator();
        }

        /** @return {@code true} while the backing list has more words */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.WordsIterator
        public boolean hasNext() {
            return this.wordsIter.hasNext();
        }

        /**
         * @return the next word from the backing list
         * @throws java.util.NoSuchElementException if the backing iterator is exhausted
         */
        @Override // com.adobe.internal.pdftoolkit.services.textextraction.WordsIterator
        public Word next() {
            return this.wordsIter.next();
        }
    }

    /**
     * Creates an extractor for the given document with no extraction options and
     * {@code useStructureInfo} left at its default of {@code false}.
     *
     * @param pDFDocument document to extract text from
     * @param pDFFontSet  font set handed to {@code Base14FontSetUtil.buildBase14FontSet}
     * @return a new extractor bound to {@code pDFDocument}
     */
    public static ReadingOrderTextExtractor newInstance(PDFDocument pDFDocument, PDFFontSet pDFFontSet) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        return new ReadingOrderTextExtractor(pDFDocument, pDFFontSet);
    }

    /**
     * Creates an extractor for the given document with an explicit structure-info flag
     * and no extraction options.
     *
     * @param pDFDocument document to extract text from
     * @param pDFFontSet  font set handed to {@code Base14FontSetUtil.buildBase14FontSet}
     * @param z           initial value of the {@code useStructureInfo} flag
     * @return a new extractor bound to {@code pDFDocument}
     */
    public static ReadingOrderTextExtractor newInstance(PDFDocument pDFDocument, PDFFontSet pDFFontSet, boolean z) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        return new ReadingOrderTextExtractor(pDFDocument, pDFFontSet, z);
    }

    /**
     * Creates an extractor for the given document configured by extraction options.
     * The options are stored and later forwarded to {@code TextExtractor.newInstance};
     * the {@code useStructureInfo} flag is taken from {@code isUseStructure()}.
     *
     * @param pDFDocument           document to extract text from
     * @param pDFFontSet            font set handed to {@code Base14FontSetUtil.buildBase14FontSet}
     * @param textExtractionOptions extraction options to use
     * @return a new extractor bound to {@code pDFDocument}
     */
    public static ReadingOrderTextExtractor newInstance(PDFDocument pDFDocument, PDFFontSet pDFFontSet, TextExtractionOptions textExtractionOptions) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        return new ReadingOrderTextExtractor(pDFDocument, pDFFontSet, textExtractionOptions);
    }

    /**
     * Builds an extractor with no extraction options and structure info disabled.
     *
     * @param pDFDocument document to extract text from
     * @param pDFFontSet  font set handed to {@code Base14FontSetUtil.buildBase14FontSet}
     */
    private ReadingOrderTextExtractor(PDFDocument pDFDocument, PDFFontSet pDFFontSet) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        // false is the field's default value, so delegating is equivalent to leaving it unset.
        this(pDFDocument, pDFFontSet, false);
    }

    /**
     * Builds an extractor with an explicit structure-info flag and no extraction options.
     *
     * @param pDFDocument document to extract text from
     * @param pDFFontSet  font set handed to {@code Base14FontSetUtil.buildBase14FontSet}
     * @param z           initial value of the {@code useStructureInfo} flag
     */
    private ReadingOrderTextExtractor(PDFDocument pDFDocument, PDFFontSet pDFFontSet, boolean z) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        PDFFontSet resolvedFonts = Base14FontSetUtil.buildBase14FontSet(pDFFontSet, pDFDocument);
        this.useStructureInfo = z;
        this.pdfDoc = pDFDocument;
        this.fontSet = resolvedFonts;
    }

    /**
     * Builds an extractor configured by extraction options.
     *
     * <p>A {@code null} options object is accepted: the rest of this class already treats a
     * {@code null} {@code teOptions} as "no options" (it is the state left by the other
     * constructors), so here it simply leaves structure info disabled instead of failing
     * with a {@link NullPointerException}.
     *
     * @param pDFDocument           document to extract text from
     * @param pDFFontSet            font set handed to {@code Base14FontSetUtil.buildBase14FontSet}
     * @param textExtractionOptions extraction options, or {@code null} for none
     */
    private ReadingOrderTextExtractor(PDFDocument pDFDocument, PDFFontSet pDFFontSet, TextExtractionOptions textExtractionOptions) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        this.pdfDoc = pDFDocument;
        this.fontSet = Base14FontSetUtil.buildBase14FontSet(pDFFontSet, pDFDocument);
        this.teOptions = textExtractionOptions;
        this.useStructureInfo = textExtractionOptions != null && textExtractionOptions.isUseStructure();
    }

    /**
     * Counts how often words start at a given x position.
     *
     * <p>The x coordinate of {@code p1()} of the first bounding quad is rounded to two
     * decimal places and used as the key into {@code sFreq}; the associated count is
     * incremented (or initialised to 1).
     *
     * @param word the word being recorded (not read by this method)
     * @param list the word's bounding quads; nothing is recorded when {@code null} or empty
     */
    private void startingFrequency(Word word, List<ASQuad> list) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException {
        if (list == null || list.isEmpty()) {
            return;
        }
        double startX = list.get(0).p1().x();
        // The float cast and int-returning Math.round(float) are kept so the keys are
        // bit-for-bit the same as before; only the deprecated boxing constructor is gone.
        Double key = Double.valueOf(Math.round((float) (startX * 100.0d)) / 100.0d);
        Integer seen = this.sFreq.get(key);
        this.sFreq.put(key, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
    }

    /**
     * Copies every starting position that has been seen more than 10 times from
     * {@code sFreq} into the supplied map.
     *
     * @param hashMap destination map; existing entries are kept, matching keys are overwritten
     */
    private void pickHighStartingFreqs(HashMap<Double, Integer> hashMap) {
        // Iterating an empty map is a no-op, so no separate emptiness check is needed.
        for (Map.Entry<Double, Integer> candidate : this.sFreq.entrySet()) {
            Integer occurrences = candidate.getValue();
            if (occurrences.intValue() <= 10) {
                continue;
            }
            hashMap.put(candidate.getKey(), occurrences);
        }
    }

    /**
     * Registers a word in the vertical ({@code vLine}, keyed by x) and horizontal
     * ({@code hLine}, keyed by y) projections, using {@code p1()} of its first bounding quad.
     *
     * <p>Because the projections are maps, a coordinate that is already taken is nudged
     * upward in steps of 1e-5 until a free key is found, so no earlier word is overwritten.
     *
     * @param word the word to register
     * @param i    index of the word, passed through to {@link SortedWord}
     * @param list the word's bounding quads; nothing is registered when {@code null} or empty
     */
    private void columnBreaks(Word word, int i, List<ASQuad> list) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException {
        // Constructed before the guard, as before, in case the constructor validates the word.
        SortedWord sortedWord = new SortedWord(word, i);
        if (list == null || list.isEmpty()) {
            return;
        }
        ASCoordinate origin = list.get(0).p1();
        double x = origin.x();
        double y = origin.y();
        this.vLine.put(firstFreeKey(this.vLine, x), sortedWord);
        this.hLine.put(firstFreeKey(this.hLine, y), sortedWord);
    }

    /**
     * Returns {@code coordinate}, bumped by 1e-5 as many times as needed until it is not
     * already a key of {@code projection}.
     *
     * NOTE(review): a NaN or infinite coordinate that is already present never changes
     * when bumped, so this loop would not terminate (same as the code it replaces) —
     * confirm upstream guarantees finite coordinates.
     */
    private static Double firstFreeKey(TreeMap<Double, SortedWord> projection, double coordinate) {
        double candidate = coordinate;
        while (projection.containsKey(Double.valueOf(candidate))) {
            candidate += 1.0E-5d;
        }
        return Double.valueOf(candidate);
    }

    /**
     * Prints a trace message to standard output (without a trailing newline) when the
     * compile-time {@code trace} level is at least {@code i}.
     *
     * @param i   minimum trace level at which the message is shown
     * @param str message text
     */
    private static void debug(int i, String str) {
        if (i > trace) {
            return;
        }
        System.out.print(str);
    }

    /** @return the live vertical projection: words keyed by (de-duplicated) x coordinate */
    private TreeMap<Double, SortedWord> getVLine() {
        return this.vLine;
    }

    /** @return the live horizontal projection: words keyed by (de-duplicated) y coordinate */
    private TreeMap<Double, SortedWord> getHLine() {
        return this.hLine;
    }

    /**
     * Returns the grouped words accumulated so far. One element is appended for each page
     * processed by {@link #getReadingOrderedTextFromPDF}; that element is whatever
     * {@code FindHVBreaks.getWordsInPage()} produced for the page.
     *
     * @return the internal list itself (not a copy), so later extraction calls are visible
     *         through it
     */
    public List<List<List<List<Word>>>> getWordsInDocument() {
        return this.wordsInDocument;
    }

    /**
     * Stores the hyphenation-resolution flag.
     * NOTE(review): within this class the flag is only returned by
     * {@link #isResolveHyphenation()}; nothing else reads it — confirm whether it is
     * consumed elsewhere.
     *
     * @param z new value of the flag
     */
    public void setResolveHyphenation(boolean z) {
        this.resolveHyphenation = z;
    }

    /** @return the hyphenation-resolution flag; {@code false} unless set via {@link #setResolveHyphenation(boolean)} */
    public boolean isResolveHyphenation() {
        return this.resolveHyphenation;
    }

    /**
     * Extracts the words of a page and returns them in reading order.
     *
     * <p>Words are pulled from {@code TextExtractor.getROTEWordsIterator} and accumulated
     * into the x/y projections ({@code vLine}/{@code hLine}) and the starting-position
     * histogram. The accumulated words are handed to {@link FindHVBreaks} at the end of the
     * stream, and also mid-stream whenever the iterator moves from another page onto page
     * {@code i}. In the mid-stream case the result list is restarted first, so words
     * ordered before that point are not part of the returned list (their grouped form is
     * still appended to {@link #getWordsInDocument()}).
     *
     * <p>Side effects: appends one entry per processed page to {@code wordsInDocument} and
     * leaves the last page's data in {@code vLine}, {@code hLine} and {@code sFreq}.
     *
     * @param pDFDocument document to read from
     * @param pDFFontSet  font set used for extraction
     * @param pDFPage     page to extract
     * @param i           page number of {@code pDFPage}, as passed on to the word iterator
     * @return the page's words in reading order; empty if no word had bounding quads
     */
    public List<Word> getReadingOrderedTextFromPDF(PDFDocument pDFDocument, PDFFontSet pDFFontSet, PDFPage pDFPage, int i) throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
        WordsIterator pageWords = TextExtractor.newInstance(pDFDocument, pDFFontSet, this.teOptions).getROTEWordsIterator(pDFPage, i);
        ArrayList<Word> ordered = new ArrayList<>();
        HashMap<Double, Integer> frequentStarts = new HashMap<>();
        int wordIndex = 0;
        int previousPage = 0; // 0 means "no word seen yet"
        while (pageWords.hasNext()) {
            Word word = pageWords.next();
            List<ASQuad> boundingQuads = word.getBoundingQuads();
            int wordPage = word.getPageNumber();
            if (previousPage != 0 && wordPage != previousPage && wordPage == i) {
                // The stream just switched onto the requested page: flush what was
                // collected for the previous page and start over.
                ordered = new ArrayList<>();
                debug(1, "For Page" + previousPage + ", start vertical/horizontal projections ==> ");
                pickHighStartingFreqs(frequentStarts);
                if (getVLine().size() > 0) {
                    orderAccumulatedWords(frequentStarts, ordered);
                }
                this.vLine = new TreeMap<>();
                this.hLine = new TreeMap<>();
                this.sFreq.clear();
                frequentStarts.clear();
                wordIndex = 0;
            }
            startingFrequency(word, boundingQuads);
            columnBreaks(word, wordIndex, boundingQuads);
            wordIndex++;
            previousPage = wordPage;
        }
        if (getVLine().size() > 0) {
            debug(1, "For Page" + previousPage + ", start vertical/horizontal projections ==> ");
            pickHighStartingFreqs(frequentStarts);
            orderAccumulatedWords(frequentStarts, ordered);
        }
        return ordered;
    }

    /**
     * Runs the FindHVBreaks pipeline over the currently accumulated projections, appends
     * the reading-ordered words to {@code out} and records the page's grouped words in
     * {@code wordsInDocument}.
     */
    private void orderAccumulatedWords(HashMap<Double, Integer> frequentStarts, ArrayList<Word> out) throws PDFInvalidDocumentException, PDFSecurityException, PDFIOException, PDFFontException {
        FindHVBreaks breaks = new FindHVBreaks();
        breaks.setStartingFrequency(frequentStarts);
        breaks.processDetermineBreaks(getVLine(), getHLine());
        debug(1, "blocks determined ==> ");
        breaks.breakWithStartingFreqs(breaks.allGroupsWithPrior);
        debug(1, "write reading-order text ==> ");
        breaks.printReadingOrderText(breaks.allGroupsWithPrior, out);
        this.wordsInDocument.add(breaks.getWordsInPage());
        debug(1, "Finished.\n");
    }

    /**
     * Concatenates the string form of every word in the nested lists and splits the
     * resulting text into sentences with a {@link BreakIterator} for the default locale.
     *
     * @param list nested word lists whose words are joined in iteration order, with no
     *             separator added between them
     * @return the sentences in order, each including any trailing characters up to the
     *         next sentence boundary; an empty list when the joined text is empty
     */
    public List<String> buildSentences(List<List<Word>> list) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException {
        StringBuilder joined = new StringBuilder();
        for (List<Word> words : list) {
            for (Word word : words) {
                joined.append(word.toString());
            }
        }
        List<String> sentences = new ArrayList<>();
        String text = joined.toString();
        if (isWordEmpty(text)) {
            return sentences;
        }
        BreakIterator boundaries = BreakIterator.getSentenceInstance(Locale.getDefault());
        boundaries.setText(text);
        int start = boundaries.first();
        // Terminate on BreakIterator.DONE rather than the unrelated debug-level constant
        // that merely shares its value. Boundaries never exceed the text length, so no
        // clamping of the end index is required.
        for (int end = boundaries.next(); end != BreakIterator.DONE; end = boundaries.next()) {
            sentences.add(text.substring(start, end));
            start = end;
        }
        return sentences;
    }

    /**
     * Tells whether the string is non-null and has zero length.
     * NOTE(review): {@code null} is reported as <em>not</em> empty; the only caller in this
     * class never passes {@code null} — confirm that is the intended contract.
     *
     * @param str string to test
     * @return {@code true} only for the empty string
     */
    private boolean isWordEmpty(String str) {
        if (str == null) {
            return false;
        }
        return str.isEmpty();
    }

    /**
     * Returns an iterator over the paragraphs of the whole document, visiting pages in
     * the order given by the document's page iterator.
     *
     * @return a new document-wide paragraph iterator; its constructor already processes
     *         the first page, if any
     */
    public ParagraphIterator getParagraphIterator() throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException, IOException {
        return new DocumentParagraphIterator();
    }

    /**
     * Returns an iterator over the paragraphs of a single page. The page is fully
     * extracted before this method returns.
     *
     * @param pDFPage page to extract
     * @param i       page number of {@code pDFPage}
     * @return an iterator over the page's paragraphs
     */
    public ParagraphIterator getParagraphIterator(PDFPage pDFPage, int i) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        return new ParagraphListIterator(extractParagraphs(pDFPage, i));
    }

    /**
     * Returns an iterator over the sentences of the whole document, visiting pages in
     * the order given by the document's page iterator.
     *
     * @return a new document-wide sentence iterator; its constructor already processes
     *         the first page, if any
     */
    public SentenceIterator getSentenceIterator() throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException, IOException {
        return new DocumentSentenceIterator();
    }

    /**
     * Returns an iterator over the sentences of a single page. The page is fully
     * extracted before this method returns.
     *
     * @param pDFPage page to extract
     * @param i       page number of {@code pDFPage}
     * @return an iterator over the page's sentences
     */
    public SentenceIterator getSentenceIterator(PDFPage pDFPage, int i) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        return new SentenceListIterator(extractSentences(pDFPage, i));
    }

    /**
     * Extracts a page with a fresh extractor (so this instance's accumulated state is
     * untouched) and splits every group it produced into sentences.
     *
     * @param pDFPage page to extract
     * @param i       page number of {@code pDFPage}
     * @return all sentences found, in extraction order
     */
    private List<String> extractSentences(PDFPage pDFPage, int i) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        ReadingOrderTextExtractor pageExtractor;
        if (this.teOptions == null) {
            pageExtractor = new ReadingOrderTextExtractor(this.pdfDoc, this.fontSet, this.useStructureInfo);
        } else {
            pageExtractor = new ReadingOrderTextExtractor(this.pdfDoc, this.fontSet, this.teOptions);
        }
        pageExtractor.getReadingOrderedTextFromPDF(this.pdfDoc, this.fontSet, pDFPage, i);
        List<String> sentences = new ArrayList<>();
        for (List<List<List<Word>>> pageGroups : pageExtractor.getWordsInDocument()) {
            for (List<List<Word>> group : pageGroups) {
                sentences.addAll(pageExtractor.buildSentences(group));
            }
        }
        return sentences;
    }

    /**
     * Extracts a page with a fresh extractor and returns a copy of the first entry that
     * extraction recorded in its {@code wordsInDocument}.
     *
     * @param pDFPage page to extract
     * @param i       page number of {@code pDFPage}
     * @return the paragraphs of the first recorded entry; empty if nothing was recorded
     */
    private List<List<List<Word>>> extractParagraphs(PDFPage pDFPage, int i) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        ReadingOrderTextExtractor pageExtractor;
        if (this.teOptions == null) {
            pageExtractor = new ReadingOrderTextExtractor(this.pdfDoc, this.fontSet, this.useStructureInfo);
        } else {
            pageExtractor = new ReadingOrderTextExtractor(this.pdfDoc, this.fontSet, this.teOptions);
        }
        pageExtractor.getReadingOrderedTextFromPDF(this.pdfDoc, this.fontSet, pDFPage, i);
        List<List<List<Word>>> paragraphs = new ArrayList<>();
        if (!pageExtractor.getWordsInDocument().isEmpty()) {
            paragraphs.addAll(pageExtractor.getWordsInDocument().get(0));
        }
        return paragraphs;
    }

    /**
     * Returns an iterator over the words of the whole document, visiting pages in the
     * order given by the document's page iterator.
     *
     * @return a new document-wide word iterator; its constructor already processes the
     *         first page, if any
     */
    public WordsIterator getWordsIterator() throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException, IOException {
        return new DocumentWordsIterator();
    }

    /**
     * Returns an iterator over the reading-ordered words of a single page. The page is
     * fully extracted before this method returns.
     *
     * @param pDFPage page to extract
     * @param i       page number of {@code pDFPage}
     * @return an iterator over the page's words
     */
    public WordsIterator getWordsIterator(PDFPage pDFPage, int i) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        return new WordListIterator(extractWords(pDFPage, i));
    }

    /**
     * Extracts a page with a fresh extractor and returns a copy of its reading-ordered
     * word list.
     *
     * @param pDFPage page to extract
     * @param i       page number of {@code pDFPage}
     * @return a new list holding the page's words in reading order
     */
    private List<Word> extractWords(PDFPage pDFPage, int i) throws PDFInvalidDocumentException, PDFIOException, PDFSecurityException, PDFFontException {
        ReadingOrderTextExtractor pageExtractor;
        if (this.teOptions == null) {
            pageExtractor = new ReadingOrderTextExtractor(this.pdfDoc, this.fontSet, this.useStructureInfo);
        } else {
            pageExtractor = new ReadingOrderTextExtractor(this.pdfDoc, this.fontSet, this.teOptions);
        }
        List<Word> ordered = pageExtractor.getReadingOrderedTextFromPDF(this.pdfDoc, this.fontSet, pDFPage, i);
        return new ArrayList<>(ordered);
    }

    /** @return the structure-info flag as set at construction or via {@link #setUseStructureInfo(boolean)} */
    public boolean isUseStructureInfo() {
        return this.useStructureInfo;
    }

    /**
     * Sets the structure-info flag. Within this class the flag is only consulted when a
     * per-page extractor is created and no extraction options are present; when options
     * are present they are passed on instead.
     *
     * @param z new value of the flag
     */
    public void setUseStructureInfo(boolean z) {
        this.useStructureInfo = z;
    }
}
