/*
 * @copyright Copyright (c) Open-Xchange GmbH, Germany <info@open-xchange.com>
 * @license AGPL-3.0
 *
 * This code is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with OX App Suite.  If not, see <https://www.gnu.org/licenses/agpl-3.0.txt>.
 *
 * Any use of the work other than as authorized under this license or copyright law is prohibited.
 *
 */

package com.openexchange.java;

import static com.openexchange.java.Strings.isNonWordCharacter;
import static com.openexchange.java.Strings.toLowerCase;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.function.IntFunction;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import gnu.trove.map.TCharObjectMap;
import gnu.trove.map.hash.TCharObjectHashMap;

/**
 * {@link HTMLDetector} - Heuristically detects HTML content in strings, byte arrays and byte streams.
 * <p>
 * A sequence is reported as containing HTML if its lower-case form
 * <ul>
 * <li>contains a <code>'&lt;'</code> character plus one of a fixed set of tag markers (or one of the caller-supplied tags),</li>
 * <li>contains the substring <code>"javascript"</code>, or</li>
 * <li>contains a stand-alone global event handler identifier such as <code>"onclick"</code>, unless that check is disabled
 * through {@link #SKIP_EVENT_HANDLER_DETECTION}.</li>
 * </ul>
 *
 * @author <a href="mailto:thorben.betten@open-xchange.com">Thorben Betten</a>
 */
public final class HTMLDetector {

    /** The known global event handler identifiers; all lower-case, all starting with <code>"on"</code> */
    private static final Set<String> JS_EVENT_HANDLERS = ImmutableSet.of(
        "onabort",
        "onactivate",
        "onafterprint",
        "onanimationend",
        "onanimationiteration",
        "onanimationstart",

        "onbeforeprint",
        "onbeforeunload",
        "onblur",

        "oncanplay",
        "oncanplaythrough",
        "onchange",
        "onclick",
        "oncontextmenu",
        "oncopy",
        "oncuechange",
        "oncut",

        "ondblclick",
        "ondomcontentloaded",
        "ondrag",
        "ondragend",
        "ondragenter",
        "ondragleave",
        "ondragover",
        "ondragstart",
        "ondrop",
        "ondurationchange",

        "onemptied",
        "onended",
        "onerror",

        "onfocus",
        "onfocusin",
        "onfocusout",

        "ongotpointercapture",

        "onhashchange",

        "oninput",
        "oninvalid",

        "onjavascript",

        "onkeydown",
        "onkeypress",
        "onkeyup",

        "onlanguagechange",
        "onload",
        "onloadeddata",
        "onloadedmetadata",
        "onloadstart",
        "onlostpointercapture",

        "onmessage",
        "onmousedown",
        "onmouseenter",
        "onmouseleave",
        "onmousemove",
        "onmouseout",
        "onmouseover",
        "onmouseup",
        "onmousewheel",

        "onoffline",
        "ononline",

        "onpageshow",
        "onpagehide",
        "onpaste",
        "onpause",
        "onplay",
        "onplaying",
        "onpointercancel",
        "onpointerdown",
        "onpointerenter",
        "onpointerleave",
        "onpointermove",
        "onpointerout",
        "onpointerover",
        "onpointerup",
        "onpopstate",
        "onprogress",

        "onratechange",
        "onrejectionhandled",
        "onreset",
        "onresize",

        "onscroll",
        "onsearch",
        "onseeked",
        "onseeking",
        "onselect",
        "onshow",
        "onstalled",
        "onstorage",
        "onsubmit",
        "onsuspend",

        "ontimeupdate",
        "ontoggle",
        "ontouchcancel",
        "ontouchend",
        "ontouchmove",
        "ontouchstart",
        "ontransitionend",

        "onunhandledrejection",
        "onunload",

        "onvolumechange",

        "onwaiting",
        "onwheel",

        "onzoom");

    /** The default event handler provider; maps the character following <code>"on"</code> to the handler identifiers having that third character */
    private static final IntFunction<List<String>> JS_EVENT_HANDLER_MAP = createMappingFunction();

    /** The tag markers that are always looked for once the lower-case sequence contains a <code>'&lt;'</code> character */
    private static final String[] COMMON_TAG_MARKERS = { "html>", "head>", "body>", "<script", "<img", "<object", "<embed", "<form", "<iframe" };

    /** The number of trailing bytes carried over to the next examination when reading from a stream */
    private static final int CHUNK_OVERLAP = 1024;

    /**
     * Gets the default global event handler provider.
     * <p>
     * <b>Only used for tests.</b>
     *
     * @return The default global event handler provider
     */
    static IntFunction<List<String>> getEventHandlerProvider() {
        // For tests only
        return JS_EVENT_HANDLER_MAP;
    }

    /**
     * Builds the default event handler provider from {@link #JS_EVENT_HANDLERS}.
     * <p>
     * The identifiers are grouped by their third character (the one following <code>"on"</code>), so that a lookup only
     * has to compare against the few identifiers that can possibly match at a given position.
     *
     * @return The function yielding the identifiers for a given third character; an empty list if there are none
     */
    private static IntFunction<List<String>> createMappingFunction() {
        TCharObjectMap<ImmutableList.Builder<String>> builders = new TCharObjectHashMap<>(26);
        for (String name : JS_EVENT_HANDLERS) {
            // E.g. "onerror" -> 'e'
            createIfAbsent(name.charAt(2), builders).add(name);
        }

        TCharObjectMap<List<String>> map = new TCharObjectHashMap<>(builders.size());
        builders.forEachEntry((character, builder) -> {
            map.put(character, builder.build());
            return true;
        });

        return ch -> {
            List<String> list = map.get((char) ch);
            return list == null ? Collections.emptyList() : list;
        };
    }

    /**
     * Gets the list builder associated with given character; creates and registers a new one if there is none yet.
     *
     * @param ch The character
     * @param map The map to look-up/put into
     * @return The list builder associated with the character
     */
    private static ImmutableList.Builder<String> createIfAbsent(char ch, TCharObjectMap<ImmutableList.Builder<String>> map) {
        ImmutableList.Builder<String> b = map.get(ch);
        if (b == null) {
            // Create list builder & add to map
            b = ImmutableList.builderWithExpectedSize(4);
            map.put(ch, b);
        }
        return b;
    }

    /** The special event handler provider telling to skip detection of possible event handlers */
    public static final IntFunction<List<String>> SKIP_EVENT_HANDLER_DETECTION = ch -> Collections.emptyList();

    /**
     * Initializes a new {@link HTMLDetector}.
     */
    private HTMLDetector() {
        super();
    }

    /**
     * Ensures that given sequence is not <code>null</code>.
     *
     * @param sequence The sequence to check
     * @throws IllegalArgumentException If sequence is <code>null</code>
     */
    private static void checkSequence(Object sequence) {
        if (sequence == null) {
            throw new IllegalArgumentException("Sequence must not be null");
        }
    }

    /**
     * Ensures that given byte sequence is not <code>null</code> and that the range described by offset and length lies within it.
     *
     * @param sequence The byte sequence
     * @param off The offset within byte array
     * @param len The length of valid bytes starting from offset
     * @throws IllegalArgumentException If sequence is <code>null</code>
     * @throws IndexOutOfBoundsException If offset or length is negative or the range exceeds the array
     */
    private static void checkRange(byte[] sequence, int off, int len) {
        checkSequence(sequence);
        if (off < 0 || len < 0 || len > sequence.length - off) {
            throw new IndexOutOfBoundsException();
        }
    }

    /**
     * Checks if given string contains common HTML tags.
     *
     * @param sequence The string to check
     * @param strict <code>true</code> for strict checking, which additionally looks for <code>"&lt;br"</code> and <code>"&lt;p&gt;"</code>; otherwise <code>false</code>
     * @param eventHandlerProvider The optional event handler provider to consider
     * @return <code>true</code> if given String contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     * @see #SKIP_EVENT_HANDLER_DETECTION
     */
    public static boolean containsHTMLTags(final String sequence, final boolean strict, IntFunction<List<String>> eventHandlerProvider) {
        return strict ? containsHTMLTags(sequence, eventHandlerProvider, "<br", "<p>") : containsHTMLTags(sequence, eventHandlerProvider);
    }

    /**
     * Checks if given string contains common HTML tags using the default event handler provider.
     *
     * @param sequence The string to check
     * @param tags Additional tags to look for
     * @return <code>true</code> if given String contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     */
    public static boolean containsHTMLTags(final String sequence, final String... tags) {
        return containsHTMLTags(sequence, JS_EVENT_HANDLER_MAP, tags);
    }

    /**
     * Checks if given string contains common HTML tags.
     * <p>
     * Tag markers (built-in as well as the additional ones) are only looked for if the sequence contains a
     * <code>'&lt;'</code> character. The <code>"javascript"</code> and event handler checks are performed regardless.
     *
     * @param sequence The string to check
     * @param eventHandlerProvider The optional event handler provider to consider; <code>null</code> selects the default one
     * @param tags Additional tags to look for; matched regardless of ASCII case
     * @return <code>true</code> if given String contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     * @see #SKIP_EVENT_HANDLER_DETECTION
     */
    public static boolean containsHTMLTags(String sequence, IntFunction<List<String>> eventHandlerProvider, String... tags) {
        checkSequence(sequence);

        String lc = Strings.asciiLowerCase(sequence);
        if (lc.indexOf('<') >= 0) {
            for (String marker : COMMON_TAG_MARKERS) {
                if (lc.indexOf(marker) >= 0) {
                    return true;
                }
            }
            if (null != tags) {
                for (String tag : tags) {
                    // The sequence has been lower-cased; the tag must be lower-cased as well to be able to match
                    if (Strings.isNotEmpty(tag) && (lc.indexOf(Strings.asciiLowerCase(tag)) >= 0)) {
                        return true;
                    }
                }
            }
        }

        if (lc.indexOf("javascript") >= 0) {
            return true;
        }
        if (eventHandlerProvider == SKIP_EVENT_HANDLER_DETECTION) {
            return false;
        }
        return doContainsEventHandler(lc, eventHandlerProvider == null ? JS_EVENT_HANDLER_MAP : eventHandlerProvider);
    }

    /**
     * Checks if given lower-case sequence contains a global event handler.
     * <p>
     * Every occurrence of <code>"on"</code> that is located at the start of the sequence or preceded by a non-word character
     * is examined: the provider is asked for the identifiers matching the character following <code>"on"</code> and each
     * of them is compared against the sequence at that position.
     *
     * @param lc The lower-case sequence to examine
     * @param eventHandlerProvider The event handler provider; <code>null</code> selects the default one
     * @return <code>true</code> if sequence contains a global event handler; otherwise <code>false</code>
     */
    public static boolean doContainsEventHandler(String lc, IntFunction<List<String>> eventHandlerProvider) {
        IntFunction<List<String>> provider = eventHandlerProvider == null ? JS_EVENT_HANDLER_MAP : eventHandlerProvider;
        int lcLength = lc.length();

        // An identifier needs at least one character after "on"; once that no longer fits, no later occurrence can fit either
        for (int pos = lc.indexOf("on"); pos >= 0 && pos + 2 < lcLength; pos = lc.indexOf("on", pos + 1)) {
            if (pos > 0 && !isNonWordCharacter(lc.charAt(pos - 1))) {
                // "on" is part of a longer word; e.g. "button"
                continue;
            }
            List<String> eventHandlers = provider.apply(lc.charAt(pos + 2));
            if (eventHandlers != null && containsEventHandlerAt(pos, eventHandlers, lc, lcLength)) {
                return true;
            }
        }

        // No global event handler identifier detected in lower-case sequence
        return false;
    }

    /**
     * Checks if there is a global event handler identifier at specified index position of given lower-case sequence.
     * <p>
     * An identifier only counts if it is terminated by the end of the sequence or by a non-word character.
     *
     * @param pos The index position at which substring <code>"on"</code> has been found
     * @param eventHandlers The list of matching event handler identifiers
     * @param lc The lower-case sequence
     * @param lcLength The length of the lower-case sequence
     * @return <code>true</code> if there is a global event handler identifier at specified index position; otherwise <code>false</code>
     */
    private static boolean containsEventHandlerAt(int pos, List<String> eventHandlers, String lc, int lcLength) {
        for (String name : eventHandlers) {
            if (lc.startsWith(name, pos)) {
                // A successful match guarantees that end does not exceed the sequence's length
                int end = pos + name.length();
                if ((end == lcLength) || isNonWordCharacter(lc.charAt(end))) {
                    // Ends with or contains global event handler
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Checks if given string contains specified HTML tag.
     * <p>
     * A tag that does not start with <code>'&lt;'</code> is wrapped into angle brackets before searching; e.g.
     * <code>"body"</code> becomes <code>"&lt;body&gt;"</code>. A tag starting with <code>'&lt;'</code> is searched as-is.
     *
     * @param sequence The string to check
     * @param tag The HTML tag; e.g. <code>"body"</code>
     * @return <code>true</code> if given String contains specified HTML tag; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence or tag is <code>null</code>
     */
    public static boolean containsHTMLTag(final String sequence, final String tag) {
        checkSequence(sequence);
        if (tag == null) {
            throw new IllegalArgumentException("Tag must not be null");
        }
        String needle = tag.startsWith("<") ? tag : new StringBuilder(tag.length() + 2).append('<').append(tag).append('>').toString();
        return containsIgnoreCase(sequence, needle);
    }

    /**
     * Checks if given string contains specified string; both are converted to lower-case before comparing.
     *
     * @param sequence The string to check
     * @param str The string
     * @return <code>true</code> if given String contains specified string; otherwise <code>false</code>
     */
    private static boolean containsIgnoreCase(final String sequence, final String str) {
        return (toLowerCase(sequence).indexOf(toLowerCase(str)) >= 0);
    }

    // ----------------------------------------------------------------------------------------- //

    /**
     * Checks if given byte sequence contains common HTML tags using the default event handler provider.
     *
     * @param in The byte stream to check
     * @param strict <code>true</code> for strict checking; otherwise <code>false</code>
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IOException If reading from stream fails
     */
    public static boolean containsHTMLTags(InputStream in, boolean strict) throws IOException {
        return containsHTMLTags(in, strict, JS_EVENT_HANDLER_MAP);
    }

    /**
     * Checks if given byte sequence contains common HTML tags; the whole stream is examined.
     *
     * @param in The byte stream to check
     * @param strict <code>true</code> for strict checking; otherwise <code>false</code>
     * @param eventHandlerProvider The optional event handler provider to consider
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IOException If reading from stream fails
     * @see #SKIP_EVENT_HANDLER_DETECTION
     */
    public static boolean containsHTMLTags(InputStream in, boolean strict, IntFunction<List<String>> eventHandlerProvider) throws IOException {
        return containsHTMLTags(in, strict, false, eventHandlerProvider);
    }

    /**
     * Checks if given byte sequence contains common HTML tags using the default event handler provider.
     *
     * @param in The byte stream to check
     * @param strict <code>true</code> for strict checking; otherwise <code>false</code>
     * @param oneShot <code>true</code> to only examine the first 8K chunk read from stream; otherwise <code>false</code> for full examination
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IOException If reading from stream fails
     */
    public static boolean containsHTMLTags(InputStream in, boolean strict, boolean oneShot) throws IOException {
        return containsHTMLTags(in, strict, oneShot, JS_EVENT_HANDLER_MAP);
    }

    /**
     * Checks if given byte sequence contains common HTML tags.
     * <p>
     * The stream is examined chunk-wise. To also detect markers that span two or more reads, the trailing bytes of everything
     * examined so far (up to {@link #CHUNK_OVERLAP}) are prepended to the next chunk. The stream is passed to
     * <code>Streams.close()</code> before this method returns.
     *
     * @param in The byte stream to check; <code>null</code> yields <code>false</code>
     * @param strict <code>true</code> for strict checking; otherwise <code>false</code>
     * @param oneShot <code>true</code> to only examine the first 8K chunk read from stream; otherwise <code>false</code> for full examination
     * @param eventHandlerProvider The optional event handler provider to consider
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IOException If reading from stream fails
     * @see #SKIP_EVENT_HANDLER_DETECTION
     */
    public static boolean containsHTMLTags(InputStream in, boolean strict, boolean oneShot, IntFunction<List<String>> eventHandlerProvider) throws IOException {
        if (null == in) {
            return false;
        }
        try {
            final int buflen = Buffers.BUFFER_SIZE_8K;

            // Layout: [carried-over tail (at most CHUNK_OVERLAP bytes)][freshly read bytes (at most buflen bytes)]
            byte[] window = new byte[CHUNK_OVERLAP + buflen];
            int carried = 0;

            for (int read; (read = in.read(window, carried, buflen)) > 0;) {
                int total = carried + read;
                if (containsHTMLTags(Charsets.toAsciiString(window, 0, total), strict, eventHandlerProvider)) {
                    return true;
                }
                if (oneShot) {
                    // Only the first chunk is supposed to be examined
                    return false;
                }

                // Keep the tail of ALL bytes examined so far (not only of the last read) so that short reads do not drop context
                carried = Math.min(CHUNK_OVERLAP, total);
                System.arraycopy(window, total - carried, window, 0, carried);
            }
            return false;
        } finally {
            Streams.close(in);
        }
    }

    /**
     * Checks if given byte sequence contains common HTML tags.
     *
     * @param sequence The byte sequence to check
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     */
    public static boolean containsHTMLTags(final byte[] sequence) {
        checkSequence(sequence);
        return containsHTMLTags(Charsets.toAsciiString(sequence));
    }

    /**
     * Checks if given byte sequence contains common HTML tags using the default event handler provider.
     *
     * @param sequence The byte sequence to check
     * @param strict <code>true</code> for strict checking; otherwise <code>false</code>
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     */
    public static boolean containsHTMLTags(final byte[] sequence, final boolean strict) {
        return containsHTMLTags(sequence, strict, JS_EVENT_HANDLER_MAP);
    }

    /**
     * Checks if given byte sequence contains common HTML tags.
     *
     * @param sequence The byte sequence to check
     * @param strict <code>true</code> for strict checking; otherwise <code>false</code>
     * @param eventHandlerProvider The optional event handler provider to consider
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     * @see #SKIP_EVENT_HANDLER_DETECTION
     */
    public static boolean containsHTMLTags(final byte[] sequence, final boolean strict, IntFunction<List<String>> eventHandlerProvider) {
        checkSequence(sequence);
        return containsHTMLTags(Charsets.toAsciiString(sequence), strict, eventHandlerProvider);
    }

    /**
     * Checks if given byte sequence contains common HTML tags.
     *
     * @param sequence The byte sequence to check
     * @param tags Additional tags to look for
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     */
    public static boolean containsHTMLTags(final byte[] sequence, final String... tags) {
        checkSequence(sequence);
        return containsHTMLTags(Charsets.toAsciiString(sequence), tags);
    }

    /**
     * Checks if given byte sequence contains common HTML tags.
     *
     * @param sequence The byte sequence to check
     * @param off The offset within byte array
     * @param len The length of valid bytes starting from offset
     * @param tags Additional tags to look for
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     * @throws IndexOutOfBoundsException If offset or length is negative or the range exceeds the array
     */
    public static boolean containsHTMLTags(final byte[] sequence, final int off, final int len, final String... tags) {
        checkRange(sequence, off, len);
        return containsHTMLTags(Charsets.toAsciiString(sequence, off, len), tags);
    }

    /**
     * Checks if given byte sequence contains common HTML tags.
     *
     * @param sequence The byte sequence to check
     * @param off The offset within byte array
     * @param len The length of valid bytes starting from offset
     * @return <code>true</code> if given byte sequence contains common HTML tags; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence is <code>null</code>
     * @throws IndexOutOfBoundsException If offset or length is negative or the range exceeds the array
     */
    public static boolean containsHTMLTags(final byte[] sequence, final int off, final int len) {
        checkRange(sequence, off, len);
        return containsHTMLTags(Charsets.toAsciiString(sequence, off, len));
    }

    /**
     * Checks if given byte sequence contains specified HTML tag.
     *
     * @param sequence The byte sequence to check
     * @param tag The HTML tag; e.g. <code>"body"</code>
     * @return <code>true</code> if given byte sequence contains specified HTML tag; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence or tag is <code>null</code>
     */
    public static boolean containsHTMLTag(final byte[] sequence, final String tag) {
        checkSequence(sequence);
        return containsHTMLTag(Charsets.toAsciiString(sequence), tag);
    }

    /**
     * Checks if given byte sequence contains specified HTML tag.
     *
     * @param sequence The byte sequence to check
     * @param off The offset within byte array
     * @param len The length of valid bytes starting from offset
     * @param tag The HTML tag; e.g. <code>"body"</code>
     * @return <code>true</code> if given byte sequence contains specified HTML tag; otherwise <code>false</code>
     * @throws IllegalArgumentException If sequence or tag is <code>null</code>
     * @throws IndexOutOfBoundsException If offset or length is negative or the range exceeds the array
     */
    public static boolean containsHTMLTag(final byte[] sequence, final int off, final int len, final String tag) {
        checkRange(sequence, off, len);
        return containsHTMLTag(Charsets.toAsciiString(sequence, off, len), tag);
    }

}
