/*
 * Decompiled with CFR 0.152.
 */
package org.apache.ctakes.core.ae;

import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB;
import org.apache.ctakes.core.pipeline.PipeBitInfo;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

@PipeBitInfo(name="PTB Tokenizer", description="Annotates Document Penn TreeBank Tokens.", dependencies={PipeBitInfo.TypeProduct.SECTION, PipeBitInfo.TypeProduct.SENTENCE}, products={PipeBitInfo.TypeProduct.BASE_TOKEN})
public class TokenizerAnnotatorPTB
extends JCasAnnotator_ImplBase {
    private Logger logger = Logger.getLogger((String)((Object)((Object)this)).getClass().getName());
    public static final String PARAM_SEGMENTS_TO_SKIP = "SegmentsToSkip";
    @ConfigurationParameter(name="SegmentsToSkip", mandatory=false, description="Set of segments that can be skipped")
    private String[] skipSegmentsArray;
    private Set<String> skipSegmentsSet;
    private TokenizerPTB tokenizer;
    private int tokenCount = 0;
    static char CR = (char)13;
    static char LF = (char)10;

    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);
        this.logger.info((Object)("Initializing " + ((Object)((Object)this)).getClass().getName()));
        this.tokenizer = new TokenizerPTB();
        this.skipSegmentsSet = new HashSet<String>();
        if (this.skipSegmentsArray != null) {
            Collections.addAll(this.skipSegmentsSet, this.skipSegmentsArray);
        }
    }

    public void process(JCas jcas) throws AnalysisEngineProcessException {
        this.logger.info((Object)("process(JCas) in " + ((Object)((Object)this)).getClass().getName()));
        this.tokenCount = 0;
        Collection segments = JCasUtil.select((JCas)jcas, Segment.class);
        for (Segment sa : segments) {
            String segmentID = sa.getId();
            if (this.skipSegmentsSet.contains(segmentID)) continue;
            this.annotateRange(jcas, sa.getBegin(), sa.getEnd());
        }
    }

    protected void annotateRange(JCas jcas, int rangeBegin, int rangeEnd) throws AnalysisEngineProcessException {
        String docText = jcas.getDocumentText();
        for (int i = rangeBegin; i < rangeEnd; ++i) {
            NewlineToken nta;
            if (docText.charAt(i) == CR) {
                if (i + 1 < rangeEnd && docText.charAt(i + 1) == LF) {
                    nta = new NewlineToken(jcas, i, i + 2);
                    ++i;
                } else {
                    nta = new NewlineToken(jcas, i, i + 1);
                }
                nta.addToIndexes();
                continue;
            }
            if (docText.charAt(i) != LF) continue;
            nta = new NewlineToken(jcas, i, i + 1);
            nta.addToIndexes();
        }
        Collection sentences = JCasUtil.select((JCas)jcas, Sentence.class);
        for (Sentence sentence : sentences) {
            if (sentence.getBegin() < rangeBegin || sentence.getEnd() > rangeEnd) continue;
            List<?> tokens = this.tokenizer.tokenizeTextSegment(jcas, sentence.getCoveredText(), sentence.getBegin(), true);
            for (Object bta : tokens) {
                if (bta == null) {
                    RuntimeException e = new RuntimeException("bta==null tokenCount=" + this.tokenCount + " tokens.size()==" + tokens.size());
                    e.printStackTrace();
                    continue;
                }
                if (BaseToken.class.isAssignableFrom(bta.getClass())) {
                    ((BaseToken)BaseToken.class.cast(bta)).addToIndexes();
                    continue;
                }
                throw new AnalysisEngineProcessException("Token returned cannot be cast as BaseToken", new Object[]{bta});
            }
        }
        Collection tokens = JCasUtil.select((JCas)jcas, BaseToken.class);
        for (BaseToken bta : tokens) {
            if (bta.getBegin() < rangeBegin || bta.getBegin() >= rangeEnd) continue;
            bta.setTokenNumber(this.tokenCount);
            ++this.tokenCount;
        }
    }

    public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
        return AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class, (Object[])new Object[0]);
    }
}

