gem5/util/find_copyrights.py

#!/usr/bin/env python3

import os
import re
import sys

from file_types import (
    find_files,
    lang_type,
)

# Editor mode line such as "-*- mode:python -*-"; only honoured on line 0.
mode_line = re.compile(r"(-\*- *mode:.* *-\*-)")
# A line whose first non-blank character is '#'.
shell_comment = re.compile(r"^\s*#")
# A ';' anywhere in the line.
lisp_comment = re.compile(r";")
# A '//' anywhere in the line.
cpp_comment = re.compile(r"//")
# Opening and closing markers of a C block comment.
c_comment_start = re.compile(r"/\*")
c_comment_end = re.compile(r"\*/")


def _line_comment_blocks(lines, comment_re, skip_shebang):
    """Yield (start, end) for leading runs of lines matching *comment_re*."""
    start = None
    for i, line in enumerate(lines):
        if i == 0 and (
            (skip_shebang and line.startswith("#!")) or mode_line.search(line)
        ):
            continue

        if comment_re.search(line):
            if start is None:
                start = i
        elif start is not None:
            # First non-comment line after a run closes that run.
            yield start, i - 1
            start = None
        elif line.strip():
            # Non-blank, non-comment text outside a run: the header is over.
            return

    # A run that reaches the end of the file (e.g. a file that contains
    # nothing but its license header) must still be reported.
    if start is not None:
        yield start, len(lines) - 1


def find_copyright_block(lines, lang_type):
    """Yield the comment blocks found at the top of a source file.

    This is a generator producing ``(start, end)`` pairs of 0-based,
    inclusive indices into *lines*, one pair per comment block.  Scanning
    stops at the first non-blank line that is not part of a comment.

    lines     -- sequence of the file's lines.
    lang_type -- language name selecting the comment syntax:
                 '#' comments for python/make/shell/perl/scons,
                 ';' comments for lisp,
                 '//' and '/* */' comments for C, C++, swig, isa, asm,
                 slicc, lex and yacc.

    For '//' blocks, a line whose comment text starts with "Copyright"
    begins a new block, and blank lines do not end the block.

    Raises AttributeError for any other language (when iteration starts)
    and AssertionError if a line holding '//' also holds '/*'.
    """
    if lang_type in ("python", "make", "shell", "perl", "scons"):
        yield from _line_comment_blocks(lines, shell_comment, True)

    elif lang_type in ("lisp",):
        yield from _line_comment_blocks(lines, lisp_comment, False)

    elif lang_type in (
        "C",
        "C++",
        "swig",
        "isa",
        "asm",
        "slicc",
        "lex",
        "yacc",
    ):
        start = None
        # None: outside any comment; "C": inside /* */; "CPP": in a // run.
        mode = None
        for i, line in enumerate(lines):
            if i == 0 and mode_line.search(line):
                continue

            if mode == "C":
                assert start is not None, "on line %d" % (i + 1)
                if c_comment_end.search(line):
                    yield start, i
                    mode = None
                continue

            cpp_match = cpp_comment.search(line)
            c_match = c_comment_start.search(line)

            if cpp_match:
                assert not c_match, "on line %d" % (i + 1)
                if line[: cpp_match.start()].strip():
                    # Code precedes the '//': the header is over.  Report
                    # the run collected so far instead of dropping it.
                    if mode == "CPP":
                        yield start, i - 1
                    return
                if mode is None:
                    mode = "CPP"
                    start = i
                else:
                    text = line[cpp_match.end() :].lstrip()
                    if text.startswith("Copyright"):
                        # A new notice starts a new block.
                        yield start, i - 1
                        start = i
                continue
            elif mode == "CPP":
                assert start is not None, "on line %d" % (i + 1)
                if not line.strip():
                    continue
                yield start, i - 1
                mode = None
                if not c_match:
                    return

            if c_match:
                assert mode is None, "on line %d" % (i + 1)
                start = i
                if c_comment_end.search(line, c_match.end()):
                    # The comment opens and closes on this one line; waiting
                    # for a later '*/' would swallow the code that follows.
                    yield start, i
                    continue
                mode = "C"

            if mode is None and line.strip():
                return

        # A '//' run that reaches the end of the file is still a block.
        if mode == "CPP":
            yield start, len(lines) - 1

    else:
        raise AttributeError(f"Could not handle language {lang_type}")


# Two four-digit years joined by a dash, e.g. "2001-2004" or "2001 - 2004".
date_range_re = re.compile(r"([0-9]{4})\s*-\s*([0-9]{4})")


def process_dates(dates):
    """Expand a comma-separated year list into a set of ints.

    Each comma-separated item is either a range matched by
    ``date_range_re`` (expanded inclusively) or a single integer.  Items
    that are neither are ignored.
    """
    years = set()
    for token in dates.split(","):
        token = token.strip()
        span = date_range_re.match(token)
        if span is not None:
            first, last = map(int, span.groups())
            years.update(range(first, last + 1))
            continue
        try:
            years.add(int(token))
        except ValueError:
            # Not a year; skip it.
            continue

    return years


# Matches "Copyright (c) <dates> <owner>".  Groups: the "(c)" marker, the raw
# date text, and the owner.  Any run of whitespace, '*', '#' or '/' between
# the dates and the owner is skipped.
# The letter range is spelled A-Za-z: the range A-z would also admit the
# punctuation that sits between 'Z' and 'a' in ASCII ('[', '\', ']', '^',
# '_' and '`').
copyright_re = re.compile(
    r"Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z-,. ]+)", re.DOTALL
)

# First line of an author list: optional comment leader, "Authors:", a name.
authors_re = re.compile(r"^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$")
# Continuation line of an author list: optional comment leader, then a name.
more_authors_re = re.compile(r"^[\s*#/]*([A-Za-z .]+)\s*$")

# Every owner string seen by get_data().
all_owners = set()


def get_data(lang_type, lines):
    """Collect the copyright notices found in the header of one file.

    Each comment block reported by find_copyright_block() is joined into a
    single string and searched with ``copyright_re``; blocks without a match
    are skipped.  Returns a list of ``(owner, dates, authors, start, end)``
    tuples, where ``dates`` is the set produced by process_dates(),
    ``authors`` is a list of names and ``start``/``end`` are line indices.

    Side effect: every owner found is added to the module-level
    ``all_owners`` set.
    """
    records = []
    for start, end in find_copyright_block(lines, lang_type):
        block_text = "".join(lines[start : end + 1])
        found = copyright_re.search(block_text)
        if not found:
            continue

        _marker, raw_dates, raw_owner = found.groups()
        raw_dates = raw_dates.strip()
        owner = raw_owner.strip()
        all_owners.add(owner)

        try:
            years = process_dates(raw_dates)
        except Exception:
            # Show what was being parsed before propagating the error.
            print(raw_dates)
            print(owner)
            raise

        authors = []
        for idx in range(start, end + 1):
            if not authors:
                # Still looking for the "Authors:" line.
                first = authors_re.search(lines[idx])
                if first:
                    authors.append(first.group(1).strip())
                continue

            more = more_authors_re.search(lines[idx])
            if more:
                authors.append(more.group(1).strip())
                continue

            # The author list is over: adjust the block end from here, then
            # stop looking at this block.
            for tail in range(idx, end + 1):
                text = lines[tail].strip()
                if not text:
                    end = tail
                    break
                if text.startswith("//") and text[2:].lstrip():
                    end = tail - 1
                    break
            break

        records.append((owner, years, authors, start, end))

    return records


def datestr(dates):
    """Format a collection of years as a compact comma-separated string.

    Consecutive years are collapsed into "first-last" ranges, e.g.
    {2001, 2002, 2003, 2005} becomes "2001-2003,2005".  An empty
    collection yields an empty string.
    """
    years = sorted(dates)
    if not years:
        # Nothing to format; previously this raised IndexError.
        return ""

    output = []

    def add_output(first, second):
        if first == second:
            output.append("%d" % (first))
        else:
            output.append("%d-%d" % (first, second))

    first = second = years[0]
    for year in years[1:]:
        if year == second + 1:
            # Extends the current run of consecutive years.
            second = year
        else:
            add_output(first, second)
            first = second = year

    add_output(first, second)

    return ",".join(output)


# Template for the usage message; %s receives the program name.
usage_str = """usage:
%s [-v] <directory>"""


def usage(exitcode):
    """Print the usage message, then exit unless *exitcode* is None."""
    print(usage_str % sys.argv[0])
    if exitcode is None:
        return
    sys.exit(exitcode)


if __name__ == "__main__":
    import getopt

    # Options: -c appends a per-owner count to each output line,
    # -i <value> records a value in `ignore`, -v reports files that could
    # not be parsed.
    show_counts = False
    # NOTE(review): values given with -i are collected here but nothing
    # below consults this set.
    ignore = set()
    verbose = False
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ci:v")
    except getopt.GetoptError:
        usage(1)

    for o, a in opts:
        if o == "-c":
            show_counts = True
        if o == "-i":
            ignore.add(a)
        if o == "-v":
            verbose = True

    # (filename, language) pairs to scan.
    files = []

    for base in args:
        if os.path.isfile(base):
            files.append((base, lang_type(base)))
        elif os.path.isdir(base):
            # presumably find_files yields (filename, language) pairs, as
            # the loop below unpacks them that way -- confirm in file_types
            files += find_files(base)
        else:
            raise AttributeError(f"can't access '{base}'")

    # owner -> set of years, and owner -> number of notices seen.
    copyrights = {}
    counts = {}

    for filename, lang in files:
        # open() replaces the Python 2 only file() builtin, and the with
        # block closes the handle.  Undecodable bytes are replaced rather
        # than raised so that one oddly encoded file cannot abort the scan.
        with open(filename, encoding="utf-8", errors="replace") as f:
            lines = f.readlines()
        if not lines:
            continue

        lines = [line.rstrip("\r\n") for line in lines]

        # The language is re-detected with the first line available.
        lt = lang_type(filename, lines[0])
        try:
            data = get_data(lt, lines)
        except Exception as e:
            # Best effort: a file that cannot be parsed is skipped.
            if verbose:
                if len(e.args) == 1:
                    e.args = (f"{e} ({filename})",)
                print(f"could not parse {filename}: {e}")
            continue

        for owner, dates, authors, start, end in data:
            copyrights.setdefault(owner, set()).update(dates)
            counts[owner] = counts.get(owner, 0) + 1

    info = [(counts[o], d, o) for o, d in copyrights.items()]

    # Most frequent owners first.  Ties are ordered by owner name; sorting
    # the raw tuples would compare the year sets with '<', which is a subset
    # test and not a total order.
    ranked = sorted(info, key=lambda entry: (entry[0], entry[2]), reverse=True)
    for count, dates, owner in ranked:
        if show_counts:
            owner = f"{owner} ({count} files)"
        print(f"Copyright (c) {datestr(dates)} {owner}")