package org.apache.uima.ruta.engine;

import java.util.Locale;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.ruta.visitor.CreatedByVisitor;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.visitors.TextExtractingVisitor;

/* loaded from: input_file:ruta-core-2.1.0.jar:org/apache/uima/ruta/engine/HtmlConverterVisitor.class */
/**
 * HTML node visitor that records the text nodes found inside the document body and the
 * positions of tags that should be rendered as line breaks.
 *
 * <p>Text nodes are collected only while the visitor is between a {@code body} start tag and
 * its end tag and not inside a script element. Tags whose normalized (lower-cased, trimmed)
 * name is contained in the set passed to the constructor produce a {@code "\n"} replacement
 * span anchored at the tag's start position.
 *
 * <p>The visitor keeps mutable traversal state ({@code inBody}, {@code inScript}) and result
 * sets, so an instance is meant to be used for one traversal at a time.
 */
public class HtmlConverterVisitor extends TextExtractingVisitor {
    /** Normalized name of the tag that delimits the region in which text is collected. */
    private static final String BODY_TAG = "body";

    /** True while the traversal is between a body start tag and its end tag. */
    private boolean inBody = false;

    /** True while the traversal is inside a script element. */
    private boolean inScript = false;

    /** When true, text nodes consisting only of whitespace are not recorded. */
    private final boolean skipWhitespace = true;

    /** Recorded text nodes, ordered by the natural ordering of {@code HtmlConverterPSpan}. */
    private final SortedSet<HtmlConverterPSpan> textSpans = new TreeSet<>();

    /** Line-break replacements derived from newline-inducing tags. */
    private final SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<>();

    /** Normalized (lower-case) names of the tags that should be turned into a line break. */
    private final Set<String> newlineInducingTags;

    /**
     * Creates a visitor.
     *
     * @param set names of tags that induce a line break; they are compared against the
     *     lower-cased, trimmed tag name, so entries are expected to be lower case. The set is
     *     stored by reference, not copied.
     */
    public HtmlConverterVisitor(Set<String> set) {
        this.newlineInducingTags = set;
    }

    /**
     * Records the text node as a span if it lies inside the body, outside any script element,
     * and (while whitespace skipping is enabled) is not blank.
     *
     * @param text the text node being visited
     */
    @Override
    public void visitStringNode(Text text) {
        super.visitStringNode(text);
        if (!this.inBody || this.inScript) {
            return;
        }
        String content = text.getText();
        if (this.skipWhitespace && StringUtils.isBlank(content)) {
            return;
        }
        this.textSpans.add(
                new HtmlConverterPSpan(text.getStartPosition(), text.getEndPosition(), content));
    }

    /**
     * Updates the body/script state for a start tag and, if the tag is newline-inducing,
     * records a {@code "\n"} replacement at the tag's start position.
     *
     * @param tag the start tag being visited
     */
    @Override
    public void visitTag(Tag tag) {
        super.visitTag(tag);
        String name = normalizedName(tag);
        if (name.equals(BODY_TAG)) {
            this.inBody = true;
        } else if (name.equals(CreatedByVisitor.FEATURE_SCRIPT)) {
            this.inScript = true;
        }
        if (this.newlineInducingTags.contains(name)) {
            int begin = tag.getStartPosition();
            // The replacement is anchored on the first character of the tag only.
            this.linebreaksFromHtmlTags.add(
                    new HtmlConverterPSpanReplacement(begin, begin + 1, "\n"));
        }
    }

    /**
     * Updates the body/script state for an end tag.
     *
     * @param tag the end tag being visited
     */
    @Override
    public void visitEndTag(Tag tag) {
        // Delegate like visitTag does, so the superclass sees end tags as well as start tags
        // for whatever state it tracks between them.
        super.visitEndTag(tag);
        String name = normalizedName(tag);
        if (name.equals(BODY_TAG)) {
            this.inBody = false;
        } else if (name.equals(CreatedByVisitor.FEATURE_SCRIPT) || (tag instanceof ScriptTag)) {
            this.inScript = false;
        }
    }

    /**
     * Returns the recorded text spans.
     *
     * @return the internal, live sorted set of text spans (not a copy)
     */
    public SortedSet<HtmlConverterPSpan> getTextSpans() {
        return this.textSpans;
    }

    /**
     * Returns the recorded line-break replacements.
     *
     * @return the internal, live sorted set of line-break spans (not a copy)
     */
    public SortedSet<HtmlConverterPSpan> getLinebreaksFromHtmlTags() {
        return this.linebreaksFromHtmlTags;
    }

    /**
     * Returns the tag name lower-cased and trimmed for comparison.
     *
     * <p>{@link Locale#ROOT} keeps the result independent of the JVM default locale; with a
     * Turkish default locale, for example, an upper-case {@code I} would otherwise become a
     * dotless i and names such as {@code SCRIPT} or {@code DIV} would never match.
     */
    private static String normalizedName(Tag tag) {
        return tag.getTagName().toLowerCase(Locale.ROOT).trim();
    }
}
