#!/usr/bin/python3
# -*- coding: utf-8 -*-

# Copyright © 2008-2010, David Paleino <d.paleino@gmail.com>
#           © 2001-2008, Tommi Virtanen <tv@debian.org>
#           © 1998-2000, Lars Wirzenius <liw@iki.fi>
#
#      This program is free software; you can redistribute it and/or modify
#      it under the terms of the GNU General Public License as published by
#      the Free Software Foundation; either version 3 of the License, or
#      (at your option) any later version.
#
#      This program is distributed in the hope that it will be useful,
#      but WITHOUT ANY WARRANTY; without even the implied warranty of
#      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#      GNU General Public License for more details.
#
#      You should have received a copy of the GNU General Public License
#      along with this program; if not, write to the Free Software
#      Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#      MA 02110-1301, USA.

"""Summarize the contents of a syslog log file.

The syslog(3) service writes system log messages in a certain format:

	Jan 17 19:21:50 zeus kernel: klogd 1.3-3, log source = /proc/kmsg started.

This program summarizes the contents of such a file, by displaying each
unique (except for the time) line once, and also the number of times such
a line occurs in the input. The lines are displayed in the order they occur
in the input.

Lars Wirzenius <liw@iki.fi>
Tommi Virtanen <tv@debian.org>
David Paleino <d.paleino@gmail.com>"""

import string
import getopt
import re
import sys
from optparse import OptionParser
from hashlib import sha1
from gzip import open as gzopen
# Program version string; main() passes it to OptionParser for --version.
version = "1.14"


# Timestamp prefixes recognised at the start of a log line.  split_date()
# tries these in order and splits the line at the end of the first match.
datepats = [
    # Classic syslog stamp, e.g. "Jan 17 19:21:50 ".
    re.compile(
        r"^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 0-9][0-9] [ 0-9][0-9]:[0-9][0-9]:[0-9][0-9] "),
    # Weekday + month, then four characters, a colon and two digits.
    # NOTE(review): there is no space between the day and hour fields in
    # this pattern (and the next one) -- confirm this is the intended format.
    re.compile(
        r"^(Mon|Tue|Wed|Thu|Fri|Sat|Sun) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 0-9][0-9][0-9][0-9]:[0-9][0-9] "),
    # Same as above with an additional ":SS" seconds field.
    re.compile(
        r"^(Mon|Tue|Wed|Thu|Fri|Sat|Sun) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 0-9][0-9][0-9][0-9]:[0-9][0-9]:[0-9][0-9] "),
    # ISO-8601-style stamp with a "+HH:MM" offset, e.g.
    # "2010-01-17T19:21:50.123456+01:00 ".
    # NOTE(review): this pattern is not anchored with "^" and only accepts
    # "+" offsets -- confirm whether that is intentional.
    re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:[.0-9]+\+\d{2}:\d{2} "),
    # Bracketed decimal number, e.g. "[   12.345678] "
    # (presumably a kernel uptime stamp -- verify).
    re.compile(r"^\[ *[0-9]+\.[0-9]+\] "),
]
# Matches a leading "<token>[ <token>][<digits>]: " prefix.  Group 1 is the
# text before the bracketed number; summarize() uses it to drop the "[pid]"
# part so that lines differing only by PID are counted together.
pidpat = re.compile(r"^([^ ]* ?[^ ]*)\[[0-9][0-9]*\]: ")
# Matches "<token> last message repeated N times"; group 1 is N.
repeatpat = re.compile(r"^[^ ]* last message repeated (\d+) times$")

# Compiled ignore rules; main() replaces this with the result of
# read_patterns().
ignore_pats = []


def io_error(err, filename, die=True):
    """Report an I/O failure and optionally terminate the program.

    Output is produced only when the module-level DEBUG flag is set:
    a fatal error (``die`` true) dumps the current traceback to stderr,
    a non-fatal one prints a single "[E] ..." line built from the errno
    carried by ``err``.  When DEBUG is off nothing is printed.

    If ``die`` is true the process exits with status 1 regardless of DEBUG.
    """
    import errno
    import os
    import traceback
    code = err.errno
    if DEBUG and die:
        # Fatal error in debug mode: the full traceback is wanted.
        traceback.print_exc(file=sys.stderr)
    elif DEBUG:
        # Non-fatal error in debug mode: one short line, no traceback.
        details = (os.strerror(code), errno.errorcode[code], code, filename)
        print("[E] %s [%s(%s) - %s]" % details)

    if die:
        sys.exit(1)


def read_patterns(filename):
    """Read the regular expressions to ignore from ``filename``.

    The file (given with -i | --ignore=) holds one regular expression per
    line.  Surrounding whitespace is stripped; blank lines and lines
    starting with "#" are skipped.

    Returns a list of compiled patterns.  If the file cannot be opened the
    error is reported through io_error() as non-fatal and an empty list is
    returned.

    Raises re.error if a rule is not a valid regular expression.
    """
    pats = []
    try:
        f = open(filename, "r")
    except IOError as e:
        io_error(e, filename, False)
        return []
    # "with" guarantees the file is closed even if a rule fails to compile.
    with f:
        for line in f:
            rule = line.strip()
            # An empty rule would compile to a regex that matches every
            # log line, silently ignoring the whole log -- skip it.
            if not rule or rule.startswith("#"):
                continue
            pats.append(re.compile(rule))
    return pats


def read_states(filename):
    """Read the state saved by a previous run (-s | --state=).

    Each line of the state file has the form written by save_states():

        <logfile name> <number of lines processed> <sha1 hex digest>

    Returns a dict mapping logfile name to ``(line_count, digest)``.
    An empty dict is returned when ``filename`` is empty/None or the file
    cannot be opened (the latter is reported through io_error() as
    non-fatal).  Blank or malformed lines are skipped.
    """
    states = {}
    if not filename:
        return states
    try:
        f = open(filename, "r")
    except IOError as e:
        io_error(e, filename, False)
        return states
    with f:
        for line in f:
            # Split from the right so that a logfile name containing
            # spaces is still parsed as a single field.
            fields = line.rsplit(None, 2)
            if len(fields) != 3:
                continue
            name, count, digest = fields
            try:
                states[name] = (int(count), digest)
            except ValueError:
                # Corrupt line count: drop the entry; the log is then
                # simply summarized from the beginning.
                continue
    return states


def save_states(filename, states):
    """Write ``states`` to ``filename`` for use by a later run.

    ``states`` maps a logfile name to ``(line_count, digest)``; one line
    "<name> <line_count> <digest>" is written per entry.  Nothing is done
    when ``filename`` is empty/None.  Failure to open the file is reported
    through io_error() as a fatal error.
    """
    if not filename:
        return
    try:
        f = open(filename, "w")
    except IOError as e:
        io_error(e, filename, True)
        # io_error() exits when fatal; never fall through with "f" unbound.
        return
    # "with" closes the file even if a write fails part-way through.
    with f:
        for logname, value in states.items():
            f.write("%s %d %s\n" % (logname, value[0], value[1]))


def should_be_ignored(line):
    """Return 1 if any pattern in the global ignore_pats matches ``line``, else 0."""
    matched = any(rule.search(line) for rule in ignore_pats)
    return 1 if matched else 0


def split_date(line):
    """Split ``line`` into its timestamp prefix and the remaining text.

    The patterns in the global datepats are tried in order; at the first
    one that matches, the line is cut at the end of the match and
    ``(prefix, remainder)`` is returned.  If none matches, a
    "line has bad date" notice is printed and ``(None, line)`` is returned.
    """
    for regex in datepats:
        hit = regex.search(line)
        if hit is None:
            continue
        cut = hit.end()
        return line[:cut], line[cut:]
    print("line has bad date", "<" + line.rstrip() + ">")
    return None, line


def is_gzipped(filename):
    """Return True if ``filename`` is a gzip-compressed file.

    Detection uses the ``magic`` module when it is importable and usable.
    If that fails for any reason, a notice is printed (unless the
    module-level QUIET flag is set) and the decision falls back to
    checking for a ".gz" file extension.
    """
    try:
        import magic
        ms = magic.open(magic.MAGIC_NONE)
        ms.load()
        return bool(re.search("^gzip compressed data.*", ms.file(filename)))
    except Exception:
        # Deliberately broad: a missing module, an incompatible "magic"
        # API or an unreadable file all lead to the extension fallback.
        # Unlike a bare "except:", this lets KeyboardInterrupt and
        # SystemExit propagate.
        from os.path import splitext

        if not QUIET:
            print(
                "Using fallback detection... please install python-magic for better gzip detection.")

        return splitext(filename)[1] == ".gz"


def summarize(filename, states):
    """Summarize one log file and record how far it has been processed.

    Every line not matched by an ignore pattern has its timestamp and
    "[pid]" removed; identical remaining messages are counted.  The
    messages are printed with their counts in order of first appearance,
    preceded by the number of ignore patterns and of ignored lines.
    Output is governed by the module-level QUIET flag: when set, nothing
    is printed unless at least one message was left to report.

    Args:
        filename: path of the log file; gzip-compressed logs are detected
            with is_gzipped() and decompressed on the fly.
        states: dict mapping filename to ``(line_count, sha1_hexdigest)``.
            If it has an entry for ``filename`` whose digest matches the
            first ``line_count`` lines of the file, those lines are
            skipped.  The entry is updated in place on return.

    Failure to open the file is reported through io_error() as fatal.
    """
    counts = {}
    order = []
    ignored_count = 0
    if not QUIET:
        print("Summarizing %s" % filename)

    # Open both kinds of file in text mode so the rest of the function
    # always works on str.  errors="replace" keeps stray bytes in a log
    # from aborting the run.
    try:
        if is_gzipped(filename):
            log = gzopen(filename, "rt", errors="replace")
        else:
            log = open(filename, "r", errors="replace")
    except IOError as e:
        io_error(e, filename, True)
        # io_error() exits when fatal; never fall through with "log" unbound.
        return

    with log:
        linecount = 0
        shaobj = sha1()
        if filename in states:
            # Re-hash the lines the previous run processed.  If the digest
            # still matches, the file has only been appended to and we
            # carry on from here; otherwise start over from the top.
            oldlines, oldsha = states[filename]
            for _ in range(oldlines):
                shaobj.update(log.readline().encode('UTF8'))
            if shaobj.hexdigest() != oldsha:
                log.seek(0)
                shaobj = sha1()
            else:
                linecount = oldlines
        if not QUIET:
            print("%8d Lines skipped (already processed)" % linecount)

        previous = None
        for line in log:
            # Every line read is hashed and counted, whatever happens to
            # it afterwards, so that the saved state matches the file.
            shaobj.update(line.encode('UTF8'))
            linecount += 1

            if should_be_ignored(line):
                ignored_count += 1
                if DEBUG:
                    print("Ignoring: %s" % line)
                continue

            if not line.strip():
                continue

            date, rest = split_date(line)
            if date:
                found = pidpat.search(rest)
                if found:
                    # Drop the "[pid]" so lines differing only by PID merge.
                    rest = found.group(1) + ": " + rest[found.end():]
            # Normalise the key so a final line without a trailing newline
            # is counted together with otherwise identical lines.
            rest = rest.rstrip("\n")

            count = 1
            repeated = None
            if REPEAT:
                repeated = repeatpat.search(rest)
            if repeated and previous:
                # "last message repeated N times": credit N occurrences to
                # the message that preceded it.
                count = int(repeated.group(1))
                rest = previous

            if rest in counts:
                counts[rest] += count
            else:
                counts[rest] = count
                order.append(rest)

            if not repeated:
                previous = rest

    # linecount already includes the ignored lines; it must equal the
    # number of lines fed to the digest or the next run cannot resume.
    states[filename] = (linecount, shaobj.hexdigest())

    if QUIET and order:
        print("Summarizing %s" % filename)
    if not QUIET or order:
        print("%8d Patterns to ignore" % len(ignore_pats))
        print("%8d Ignored lines" % ignored_count)
    for rest in order:
        print("%8d %s" % (counts[rest], rest))
    if not QUIET or order:
        print()


def main():
    """Command-line entry point.

    Parses the options, publishes them as module-level settings
    (IGNORE_FILENAME, STATE_FILENAME, REPEAT, QUIET, DEBUG), loads the
    ignore patterns and the saved state, summarizes every log file named
    on the command line and finally writes the updated state back.

    Exits through OptionParser.error() when no log file is given.
    """
    global ignore_pats, IGNORE_FILENAME, STATE_FILENAME, REPEAT, QUIET, DEBUG

    parser = OptionParser(usage="%prog [options] <logfile> [<logfile> ...]",
                          version="%%prog %s" % version,
                          description="Summarize the contents of a syslog log file")
    parser.add_option("-i", "--ignore", dest="ignorefile", default="/etc/syslog-summary/ignore.rules",
                      help="read regular expressions from <file>, and ignore lines in the <logfile> that match them",
                      metavar="<file>")
    parser.add_option("-s", "--state", dest="statefile",
                      help="read state information from <file> (see the man page)",
                      metavar="<file>")
    parser.add_option("-r", "--repeat", action="store_true", dest="repeat", default=False,
                      help="merge \"last message repeated x times\" with the event repeated")
    parser.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False,
                      help="don't output anything, unless there were unmatched lines")
    parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False,
                      help="shows additional messages in case of error")

    (options, args) = parser.parse_args()

    # Check the positional arguments rather than sys.argv: an invocation
    # with only options (e.g. "-q") still names no log file.
    if not args:
        parser.error("no logfile specified")

    IGNORE_FILENAME = options.ignorefile
    STATE_FILENAME = options.statefile
    REPEAT = options.repeat
    QUIET = options.quiet
    DEBUG = options.debug

    ignore_pats = read_patterns(IGNORE_FILENAME)
    states = read_states(STATE_FILENAME)
    for filename in args:
        summarize(filename, states)
    save_states(STATE_FILENAME, states)


# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
