/*
 * Decompiled with CFR 0.152.
 */
package org.apache.manifoldcf.agents.transformation.htmlextractor;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.manifoldcf.agents.interfaces.IOutputAddActivity;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
import org.apache.manifoldcf.agents.transformation.BaseTransformationConnector;
import org.apache.manifoldcf.agents.transformation.htmlextractor.JsoupProcessing;
import org.apache.manifoldcf.agents.transformation.htmlextractor.Messages;
import org.apache.manifoldcf.agents.transformation.htmlextractor.exception.RegexException;
import org.apache.manifoldcf.core.interfaces.ConfigParams;
import org.apache.manifoldcf.core.interfaces.ConfigurationNode;
import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
import org.apache.manifoldcf.core.interfaces.IPostParameters;
import org.apache.manifoldcf.core.interfaces.IThreadContext;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.SpecificationNode;
import org.apache.manifoldcf.core.interfaces.VersionContext;
import org.apache.manifoldcf.crawler.system.Logging;

public class HtmlExtractor
extends BaseTransformationConnector {
    /** Revision-control identification string. */
    public static final String _rcsid = "@(#)$Id$";
    /** Activity name passed to {@code recordActivity} by {@code addOrReplaceDocumentWithException}. */
    protected static final String ACTIVITY_PROCESS = "process";
    /** Activity names returned by {@code getActivitiesList()}. */
    protected static final String[] activitiesList = new String[]{"process"};
    // Names of the Velocity resources rendered by the configuration/specification UI methods.
    private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js";
    private static final String VIEW_CONFIGURATION_HTML = "viewConfiguration.html";
    private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
    private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
    private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML = "editSpecification_HTML_Extractor.html";
    // HTML strip modes. Neither constant is referenced by name in this file;
    // html_strip_usage is initialised to 1, the value of HTML_STRIP_ALL.
    protected static final int HTML_STRIP_NONE = 0;
    protected static final int HTML_STRIP_ALL = 1;
    /** Exposed to the specification templates as the "HTMLTAGUSAGE" parameter. Mutable and shared by all instances. */
    protected static int html_strip_usage = 1;
    /** Specification node type carrying the strip-HTML flag (note: the constant name says "keep metadata" but the value is "striphtml"). */
    public static final String NODE_KEEPMETADATA = "striphtml";
    // NODE_FILTEREMPTY, ATTRIBUTE_SOURCE and ATTRIBUTE_TARGET are not referenced in this file.
    public static final String NODE_FILTEREMPTY = "filterEmpty";
    public static final String ATTRIBUTE_SOURCE = "source";
    public static final String ATTRIBUTE_TARGET = "target";
    /** Attribute name under which the strip-HTML node stores its value. */
    public static final String ATTRIBUTE_VALUE = "value";
    // Not referenced in this file; presumably the size threshold for choosing
    // in-memory over file-backed DestinationStorage — TODO confirm.
    protected static final long inMemoryMaximumFile = 65536L;

    /**
     * Lists the activity names this connector may record.
     *
     * @return the shared {@code activitiesList} array itself (not a copy)
     */
    public String[] getActivitiesList() {
        final String[] supportedActivities = activitiesList;
        return supportedActivities;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Extracts text and metadata from an HTML/XHTML document with Jsoup, replaces the
     * document's binary content with the extracted text, and forwards the document to the
     * next pipeline stage.
     *
     * <p>Documents whose mime type is missing, or is neither {@code text/html} nor
     * {@code application/xhtml+xml}, are forwarded untouched and no activity is recorded.
     * Every extracted entry other than {@code "extractedDoc"} is added to the document as a
     * field named {@code "jsoup_" + key}.
     *
     * @param documentURI the URI of the document being processed
     * @param pipelineDescription provides the specification holding the include/exclude filters
     *        and the strip-HTML flag
     * @param document the document to transform; its binary stream and fields may be replaced/extended
     * @param authorityNameString not used by this connector
     * @param activities used to forward the document and to record the "process" activity
     * @return whatever {@code activities.sendDocument} returns
     * @throws ManifoldCFException rethrown from extraction or from the downstream pipeline
     * @throws ServiceInterruption rethrown from extraction or from the downstream pipeline
     * @throws IOException rethrown from extraction or from the downstream pipeline
     */
    public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities) throws ManifoldCFException, ServiceInterruption, IOException {
        long startTime = System.currentTimeMillis();
        String resultCode = "OK";
        String description = null;
        Long length = null;
        SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
        Logging.connectors.debug((Object)"Processing by HTML Extractor");

        // A document qualifies when its mime type is HTML *or* XHTML. The previous test
        // (!html || xhtml) could never accept XHTML and threw a NullPointerException when
        // the mime type was not set.
        String mimeType = document.getMimeType();
        boolean isHtml = mimeType != null
            && (mimeType.startsWith("text/html") || mimeType.startsWith("application/xhtml+xml"));
        if (!isHtml) {
            Logging.connectors.debug((Object)"no processing, mime type not html");
            return activities.sendDocument(documentURI, document);
        }

        try {
            Logging.connectors.debug((Object)"Document recognized as HTML - processing");
            length = Long.valueOf(document.getBinaryLength());

            // The first include filter is the primary selector; SpecPacker guarantees at least one.
            Map<?, ?> extracted = JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(), sp.includeFilters.get(0), sp.excludeFilters, sp.striphtml);

            // Replace the binary content with the UTF-8 encoding of the extracted text.
            byte[] extractedBytes = ((String)extracted.get("extractedDoc")).getBytes(StandardCharsets.UTF_8);
            document.setBinary((InputStream)new ByteArrayInputStream(extractedBytes), (long)extractedBytes.length);

            for (Map.Entry<?, ?> entry : extracted.entrySet()) {
                // Compare by value: identity comparison of Strings only works by accident of interning.
                if ("extractedDoc".equals(entry.getKey())) continue;
                document.addField("jsoup_" + (String)entry.getKey(), (String)entry.getValue());
            }
            return activities.sendDocument(documentURI, document);
        }
        catch (ServiceInterruption e) {
            resultCode = "SERVICEINTERRUPTION";
            description = e.getMessage();
            throw e;
        }
        catch (ManifoldCFException e) {
            resultCode = "EXCEPTION";
            description = e.getMessage();
            throw e;
        }
        catch (IOException e) {
            resultCode = "IOEXCEPTION";
            description = e.getMessage();
            throw e;
        }
        catch (Exception e) {
            // Best effort: any other failure is recorded (and now logged) and the document is
            // still forwarded below.
            // NOTE(review): the original binary stream may already have been consumed by the
            // extraction attempt at this point — confirm downstream behaviour.
            resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            description = e.getMessage();
            Logging.connectors.warn("HTML Extractor: extraction failed for '" + documentURI + "', forwarding document as-is", e);
        }
        finally {
            activities.recordActivity(Long.valueOf(startTime), ACTIVITY_PROCESS, length, documentURI, resultCode, description);
        }
        return activities.sendDocument(documentURI, document);
    }

    /**
     * Finds the first regular expression in {@code regexList} that matches somewhere in {@code str}.
     *
     * @param regexList candidate regular expressions, tried in list order
     * @param str the text to search
     * @return the first expression for which {@code find()} succeeds, or {@code null} if none does
     * @throws RegexException if a candidate is not a syntactically valid regular expression
     */
    private String matchingRegex(List<String> regexList, String str) throws RegexException {
        for (String candidate : regexList) {
            boolean found;
            try {
                found = Pattern.compile(candidate).matcher(str).find();
            }
            catch (PatternSyntaxException e) {
                throw new RegexException(candidate, "Invalid regular expression");
            }
            if (found) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Renders the {@code editConfiguration.js} resource into the configuration page header.
     * No Velocity context is supplied ({@code null}); {@code threadContext}, {@code parameters}
     * and {@code tabsArray} are not used.
     *
     * @param out the output the resource is rendered to
     * @param locale the locale used to select the resource
     * @throws ManifoldCFException propagated from the resource rendering
     * @throws IOException propagated from the resource rendering
     */
    public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters, List<String> tabsArray) throws ManifoldCFException, IOException {
        // This connector contributes only a script resource to the header, with no template parameters.
        Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIGURATION_JS, null);
    }

    /**
     * Outputs the configuration page body. This connector renders nothing here: the method
     * writes no output and reads none of its arguments.
     *
     * <p>The previous implementation populated a local map with the tab name and then
     * discarded it; that dead code has been removed without changing observable behaviour.
     *
     * @param threadContext unused
     * @param out unused; nothing is written to it
     * @param locale unused
     * @param parameters unused
     * @param tabName unused
     */
    public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters, String tabName) throws ManifoldCFException, IOException {
        // Intentionally empty: no configuration body is rendered by this connector.
    }

    /**
     * Handles a configuration form post. Nothing is read from the post and
     * {@code parameters} is left untouched.
     *
     * @return always {@code null}
     */
    public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext, Locale locale, ConfigParams parameters) throws ManifoldCFException {
        final String result = null;
        return result;
    }

    /**
     * Renders the {@code viewConfiguration.html} resource with an empty Velocity context.
     * {@code threadContext} and {@code parameters} are not used.
     *
     * @param out the output the resource is rendered to
     * @param locale the locale used to select the resource
     * @throws ManifoldCFException propagated from the resource rendering
     * @throws IOException propagated from the resource rendering
     */
    public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters) throws ManifoldCFException, IOException {
        Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIGURATION_HTML, new HashMap<String, Object>());
    }

    /**
     * Copies the extractor settings found in a specification into a template parameter map.
     *
     * <p>Entries written: {@code "INCLUDEFILTERS"} and {@code "EXCLUDEFILTERS"} (lists of the
     * non-null {@code regex} attributes of the matching nodes, in specification order),
     * {@code "HTMLTAGUSAGE"} (the current {@code html_strip_usage}) and {@code "STRIPHTML"}
     * (the {@code value} attribute of the last strip-HTML node, or {@code "true"} when the
     * specification has none).
     *
     * @param paramMap the map to populate
     * @param os the specification to read
     */
    protected static void fillInHtmlExtractorSpecification(Map<String, Object> paramMap, Specification os) {
        final ArrayList<String> includes = new ArrayList<String>();
        final ArrayList<String> excludes = new ArrayList<String>();
        String stripHtml = "true";

        for (int index = 0; index < os.getChildCount(); index++) {
            final SpecificationNode child = os.getChild(index);
            if (child.getType().equals("includefilter")) {
                final String regex = child.getAttributeValue("regex");
                if (regex != null) {
                    includes.add(regex);
                }
            } else if (child.getType().equals("excludefilter")) {
                final String regex = child.getAttributeValue("regex");
                if (regex != null) {
                    excludes.add(regex);
                }
            } else if (child.getType().equals(NODE_KEEPMETADATA)) {
                // A later node overrides an earlier one; the value is taken as-is (may be null).
                stripHtml = child.getAttributeValue(ATTRIBUTE_VALUE);
            }
        }

        paramMap.put("INCLUDEFILTERS", includes);
        paramMap.put("EXCLUDEFILTERS", excludes);
        paramMap.put("HTMLTAGUSAGE", html_strip_usage);
        paramMap.put("STRIPHTML", stripHtml);
    }

    /**
     * Registers this connector's tab and renders the {@code editSpecification.js} resource.
     *
     * @param out the output the resource is rendered to
     * @param locale the locale used for the tab title and the resource
     * @param os the specification whose current settings are exposed to the template
     * @param connectionSequenceNumber exposed to the template as {@code "SEQNUM"}
     * @param tabsArray receives the localized tab name
     * @throws ManifoldCFException propagated from message lookup or rendering
     * @throws IOException propagated from rendering
     */
    public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber, List<String> tabsArray) throws ManifoldCFException, IOException {
        final HashMap<String, Object> velocityParams = new HashMap<String, Object>();
        velocityParams.put("SEQNUM", String.valueOf(connectionSequenceNumber));

        final String tabTitle = Messages.getString(locale, "HtmlExtractorTransformationConnector.HtmlExtractorTabName");
        tabsArray.add(tabTitle);

        HtmlExtractor.fillInHtmlExtractorSpecification(velocityParams, os);
        Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, velocityParams);
    }

    /**
     * Renders the {@code editSpecification_HTML_Extractor.html} resource for the specification editor.
     *
     * @param out the output the resource is rendered to
     * @param locale the locale used to select the resource
     * @param os the specification whose current settings are exposed to the template
     * @param connectionSequenceNumber exposed to the template as {@code "SEQNUM"}
     * @param actualSequenceNumber exposed to the template as {@code "SELECTEDNUM"}
     * @param tabName exposed to the template as {@code "TABNAME"}
     * @throws ManifoldCFException propagated from rendering
     * @throws IOException propagated from rendering
     */
    public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber, int actualSequenceNumber, String tabName) throws ManifoldCFException, IOException {
        final HashMap<String, Object> velocityParams = new HashMap<String, Object>();
        velocityParams.put("TABNAME", tabName);
        velocityParams.put("SEQNUM", String.valueOf(connectionSequenceNumber));
        velocityParams.put("SELECTEDNUM", String.valueOf(actualSequenceNumber));
        HtmlExtractor.fillInHtmlExtractorSpecification(velocityParams, os);
        Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML, velocityParams);
    }

    /**
     * Applies a posted specification form to {@code os}.
     *
     * <p>Three independent sections are handled, each only when its marker parameter
     * ({@code includefilter_count}, {@code excludefilter_count}, {@code striphtml_present},
     * all prefixed with {@code "s" + connectionSequenceNumber + "_"}) is present and non-empty:
     * the existing nodes of that type are removed and rebuilt from the posted values.
     * A missing strip-HTML parameter is stored as {@code "false"}.
     *
     * @param variableContext the posted form parameters
     * @param locale not used
     * @param os the specification to update in place
     * @param connectionSequenceNumber used to build the form-parameter prefix
     * @return always {@code null}
     * @throws ManifoldCFException if a posted filter count is not a valid integer
     */
    public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification os, int connectionSequenceNumber) throws ManifoldCFException {
        String seqPrefix = "s" + connectionSequenceNumber + "_";

        HtmlExtractor.applyFilterPost(variableContext, os, seqPrefix, "includefilter");
        HtmlExtractor.applyFilterPost(variableContext, os, seqPrefix, "excludefilter");

        String stripPresent = variableContext.getParameter(seqPrefix + "striphtml_present");
        if (stripPresent != null && stripPresent.length() > 0) {
            String stripValue = variableContext.getParameter(seqPrefix + NODE_KEEPMETADATA);
            if (stripValue == null) {
                // No value was posted for the flag while the section is present: store "false".
                stripValue = "false";
            }
            HtmlExtractor.removeChildrenOfType(os, NODE_KEEPMETADATA);
            SpecificationNode stripNode = new SpecificationNode(NODE_KEEPMETADATA);
            stripNode.setAttribute(ATTRIBUTE_VALUE, stripValue);
            os.addChild(os.getChildCount(), (ConfigurationNode)stripNode);
        }
        return null;
    }

    /** Rebuilds all nodes of {@code nodeType} from the posted rows, honouring per-row "Delete" and the trailing "Add" operation. */
    private static void applyFilterPost(IPostParameters variableContext, Specification os, String seqPrefix, String nodeType) throws ManifoldCFException {
        String prefix = seqPrefix + nodeType + "_";
        String countValue = variableContext.getParameter(prefix + "count");
        if (countValue == null || countValue.length() == 0) {
            return;
        }

        // Parse before touching the specification so a malformed count cannot leave it half-edited.
        int count;
        try {
            count = Integer.parseInt(countValue);
        }
        catch (NumberFormatException e) {
            throw new ManifoldCFException("Invalid " + nodeType + " count: '" + countValue + "'", e);
        }

        HtmlExtractor.removeChildrenOfType(os, nodeType);
        for (int i = 0; i < count; ++i) {
            String suffix = "_" + i;
            String op = variableContext.getParameter(prefix + "op" + suffix);
            if ("Delete".equals(op)) continue;
            HtmlExtractor.appendFilterNode(os, nodeType, variableContext.getParameter(prefix + "regex" + suffix));
        }

        String addop = variableContext.getParameter(prefix + "op");
        if ("Add".equals(addop)) {
            HtmlExtractor.appendFilterNode(os, nodeType, variableContext.getParameter(prefix + "regex"));
        }
    }

    /** Removes every direct child of {@code os} whose type equals {@code nodeType}. */
    private static void removeChildrenOfType(Specification os, String nodeType) {
        int i = 0;
        while (i < os.getChildCount()) {
            if (os.getChild(i).getType().equals(nodeType)) {
                // Do not advance: the next child has shifted into position i.
                os.removeChild(i);
                continue;
            }
            ++i;
        }
    }

    /** Appends a filter node of {@code nodeType} carrying {@code regex}; a missing ({@code null}) regex is skipped. */
    private static void appendFilterNode(Specification os, String nodeType, String regex) {
        if (regex == null) {
            return;
        }
        SpecificationNode node = new SpecificationNode(nodeType);
        node.setAttribute("regex", regex);
        os.addChild(os.getChildCount(), (ConfigurationNode)node);
    }

    /**
     * Renders the read-only {@code viewSpecification.html} resource.
     *
     * @param out the output the resource is rendered to
     * @param locale the locale used to select the resource
     * @param os the specification whose settings are exposed to the template
     * @param connectionSequenceNumber exposed to the template as {@code "SEQNUM"}
     * @throws ManifoldCFException propagated from rendering
     * @throws IOException propagated from rendering
     */
    public void viewSpecification(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber) throws ManifoldCFException, IOException {
        final HashMap<String, Object> velocityParams = new HashMap<String, Object>();
        velocityParams.put("SEQNUM", String.valueOf(connectionSequenceNumber));
        HtmlExtractor.fillInHtmlExtractorSpecification(velocityParams, os);
        Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, velocityParams);
    }

    /**
     * Unpacked, read-only view of the extractor settings stored in a {@link Specification}.
     *
     * <p>Filter nodes without a {@code regex} attribute are ignored, matching what
     * {@code fillInHtmlExtractorSpecification} shows in the UI; previously such a node put a
     * {@code null} into the filter list, where it could become the primary selector.
     */
    protected static class SpecPacker {
        /** Include selectors in specification order; never empty (defaults to {@code "body"}). */
        private final List<String> includeFilters = new ArrayList<String>();
        /** Exclude selectors in specification order; may be empty. */
        private final List<String> excludeFilters = new ArrayList<String>();
        /** Strip-HTML flag; {@code true} unless a strip-HTML node says otherwise. */
        private final boolean striphtml;

        /**
         * Reads the include/exclude filters and the strip-HTML flag from the specification.
         *
         * @param os the specification to unpack
         */
        public SpecPacker(Specification os) {
            boolean strip = true;
            for (int i = 0; i < os.getChildCount(); ++i) {
                SpecificationNode sn = os.getChild(i);
                String type = sn.getType();
                if (type.equals("includefilter")) {
                    SpecPacker.addIfPresent(this.includeFilters, sn.getAttributeValue("regex"));
                } else if (type.equals("excludefilter")) {
                    SpecPacker.addIfPresent(this.excludeFilters, sn.getAttributeValue("regex"));
                } else if (type.equals(HtmlExtractor.NODE_KEEPMETADATA)) {
                    // The last node wins; a missing or non-"true" value parses as false.
                    strip = Boolean.parseBoolean(sn.getAttributeValue(HtmlExtractor.ATTRIBUTE_VALUE));
                }
            }
            if (this.includeFilters.isEmpty()) {
                // Without any include filter the whole body is selected.
                this.includeFilters.add("body");
            }
            this.striphtml = strip;
        }

        /** Adds {@code regex} to {@code target} unless it is {@code null}. */
        private static void addIfPresent(List<String> target, String regex) {
            if (regex != null) {
                target.add(regex);
            }
        }

        /**
         * Serializes the settings: the packed include list, the packed exclude list (both with
         * {@code '+'} as delimiter), followed by {@code '+'} when HTML is stripped or
         * {@code '-'} when it is not.
         *
         * @return the packed string
         */
        public String toPackedString() {
            StringBuilder sb = new StringBuilder();
            HtmlExtractor.packList(sb, this.includeFilters, '+');
            HtmlExtractor.packList(sb, this.excludeFilters, '+');
            sb.append(this.striphtml ? '+' : '-');
            return sb.toString();
        }
    }

    /**
     * {@link DestinationStorage} that keeps all written bytes in a {@link ByteArrayOutputStream}.
     */
    protected static class MemoryDestinationStorage
    implements DestinationStorage {
        /** Buffer receiving everything written through {@link #getOutputStream()}. */
        protected final ByteArrayOutputStream outputStream;

        /**
         * @param sizeHint initial capacity of the backing buffer, in bytes
         */
        public MemoryDestinationStorage(int sizeHint) {
            outputStream = new ByteArrayOutputStream(sizeHint);
        }

        /** @return the backing buffer, as the stream to write content to */
        @Override
        public OutputStream getOutputStream() throws ManifoldCFException {
            return outputStream;
        }

        /** @return the number of bytes written so far */
        @Override
        public long getBinaryLength() throws ManifoldCFException {
            final long written = outputStream.size();
            return written;
        }

        /** @return a new stream over a copy of the bytes written so far */
        @Override
        public InputStream getInputStream() throws ManifoldCFException {
            final byte[] snapshot = outputStream.toByteArray();
            return new ByteArrayInputStream(snapshot);
        }

        /** Nothing to release for in-memory storage. */
        @Override
        public void close() throws ManifoldCFException {
        }

        /**
         * Converts an {@link IOException} into a {@link ManifoldCFException}; never returns normally.
         *
         * @param e the exception to convert
         * @return never returns; declared as {@code int} only for the benefit of callers
         * @throws ManifoldCFException always; built with the extra code {@code 2} when {@code e}
         *         is an {@link InterruptedIOException}
         */
        protected static int handleIOException(IOException e) throws ManifoldCFException {
            final String message = e.getMessage();
            if (!(e instanceof InterruptedIOException)) {
                throw new ManifoldCFException(message, e);
            }
            throw new ManifoldCFException(message, e, 2);
        }
    }

    /**
     * {@link DestinationStorage} backed by a temporary file.
     *
     * <p>I/O failures are reported as {@link ManifoldCFException}. Previously they were
     * silently swallowed, which left the storage with {@code null} fields or made
     * {@link #getInputStream()} return {@code null}.
     */
    protected static class FileDestinationStorage
    implements DestinationStorage {
        /** The temporary file holding the content; deleted by {@link #close()}. */
        protected final File outputFile;
        /** Stream writing into {@link #outputFile}; closed by {@link #close()}. */
        protected final OutputStream outputStream;

        /**
         * Creates the temporary file and opens it for writing.
         *
         * @throws ManifoldCFException if the file cannot be created or opened
         */
        public FileDestinationStorage() throws ManifoldCFException {
            File tempFile;
            try {
                tempFile = File.createTempFile("mcftika", "tmp");
            }
            catch (IOException e) {
                throw FileDestinationStorage.wrapIOException(e);
            }
            OutputStream tempStream;
            try {
                tempStream = new FileOutputStream(tempFile);
            }
            catch (IOException e) {
                // Do not leave the just-created temp file behind when it cannot be opened.
                tempFile.delete();
                throw FileDestinationStorage.wrapIOException(e);
            }
            this.outputFile = tempFile;
            this.outputStream = tempStream;
        }

        /** @return the stream writing into the temporary file */
        @Override
        public OutputStream getOutputStream() throws ManifoldCFException {
            return this.outputStream;
        }

        /** @return the current length of the temporary file, in bytes */
        @Override
        public long getBinaryLength() throws ManifoldCFException {
            return this.outputFile.length();
        }

        /**
         * @return a new stream reading the temporary file from its beginning
         * @throws ManifoldCFException if the file cannot be opened
         */
        @Override
        public InputStream getInputStream() throws ManifoldCFException {
            try {
                return new FileInputStream(this.outputFile);
            }
            catch (IOException e) {
                throw FileDestinationStorage.wrapIOException(e);
            }
        }

        /**
         * Closes the output stream and deletes the temporary file. The file is deleted even
         * when closing the stream fails.
         *
         * @throws ManifoldCFException if closing the output stream fails
         */
        @Override
        public void close() throws ManifoldCFException {
            try {
                this.outputStream.close();
            }
            catch (IOException e) {
                throw FileDestinationStorage.wrapIOException(e);
            }
            finally {
                this.outputFile.delete();
            }
        }

        /**
         * Builds the {@link ManifoldCFException} for an I/O failure, using the same mapping as
         * {@code MemoryDestinationStorage.handleIOException}: the extra code {@code 2} is
         * passed for an {@link InterruptedIOException}.
         */
        private static ManifoldCFException wrapIOException(IOException e) {
            if (e instanceof InterruptedIOException) {
                return new ManifoldCFException(e.getMessage(), e, 2);
            }
            return new ManifoldCFException(e.getMessage(), e);
        }
    }

    protected static interface DestinationStorage {
        public OutputStream getOutputStream() throws ManifoldCFException;

        public long getBinaryLength() throws ManifoldCFException;

        public InputStream getInputStream() throws ManifoldCFException;

        public void close() throws ManifoldCFException;
    }
}

