#! /usr/bin/env python3
"""Analyze the graph defined by dependencies in a multiproject
repository, and generate useful reports on them.

This tool is named for its relationship with repocutter swapsvn and
should be useful for preparing a multiproject repository for that
operation.  See also the repocutter swapcheck subcommand.

The big difference between repocutter and this tool is that
repocutter is optimized for speed and to hold down its working
set to one or a handful of adjacent Subversion nodes at a time.
This tool, on the other hand, pulls the entire set of copies
in a stream into memory in order to do random access.

The multiproject repository is expected not to have
trunk/branches/tags structure, but rather to have project
subdirectories which themselves may have trunk/branches/tags
structure.  Each project is a node of the graph, the relation
is copies-to.

A "goal" is a project that has been marked as being
interesting and should be preserved in the repository
conversion.

Assumes "=" never occurs in any pathname and has been used to
substitute for a slash that should be ignored.  The "delimiter"
command can be used to set a different one.

Typical invocations:

reposwapper "read myrepo.see" "expunge"
reposwapper "read myrepo.see" "graph"

Copyright by Eric S. Raymond
SPDX-License-Identifier: BSD-2-Clause

"""
# pylint: disable=line-too-long,invalid-name,missing-function-docstring,no-else-return,no-else-break,no-else-continue,raise-missing-from

# pylint: disable=multiple-imports
import sys, shlex, subprocess

def coalesce(a, b):
    """Attempt to coalesce an endpoint or range at a with a neighbor at b.

    An endpoint has the form "REV.NODE"; a range is two endpoints joined
    by ":".  Returns the merged range string when b is the node directly
    following the end of a within the same revision, otherwise None.
    """
    if ":" in a:
        lo, hi = a.split(":")
        lo_rev, lo_node = lo.split(".")
        # Only the node part of the upper bound is needed: ranges built
        # here always end in the same revision they start in.
        _, hi_node = hi.split(".")
    else:
        lo_rev, lo_node = a.split(".")
        hi_node = lo_node

    b_rev, b_node = b.split(".")

    # Check for non-adjacency
    if lo_rev != b_rev or int(b_node) != int(hi_node) + 1:
        return None

    return lo_rev + "." + lo_node + ":" + b

def _spec_sort_key(spec):
    "Sort key ordering node specs numerically by (revision, node)."
    start = spec.split(":")[0]
    try:
        return (0, tuple(int(part) for part in start.split(".")), spec)
    except ValueError:
        # Specs that are not purely numeric sort after the numeric ones,
        # in plain string order.
        return (1, (), spec)

def compactify(selection):
    """Produce the shortest possible selection expression for a list of node specs.

    The specs are ordered numerically by revision and node (a plain string
    sort would put "1.10" before "1.9" and hide their adjacency), then runs
    of consecutive nodes within one revision are folded into "LO:HI" ranges.
    Returns a new list; the argument is left untouched.
    """
    merged = []
    for spec in sorted(selection, key=_spec_sort_key):
        # Try to extend the most recent endpoint or range with this spec
        combined = coalesce(merged[-1], spec) if merged else None
        if combined:
            merged[-1] = combined
        else:
            merged.append(spec)
    return merged

class Copy:
    "Represent a parsed copy operation from a see file"
    # pylint: disable=too-many-arguments
    def __init__(self, interpreter, revision, target, from_rev, from_path):
        # The interpreter is only consulted for get_project(); everything
        # else is stored exactly as given.
        self.interpreter, self.revision = interpreter, revision
        self.target = target
        self.from_rev, self.from_path = from_rev, from_path
    def interior(self):
        "Is this a copy within a project tree?"
        project_of = self.interpreter.get_project
        return project_of(self.target) == project_of(self.from_path)
    def filecopy(self):
        "Is this a file copy?"
        # Directory paths carry a trailing slash
        return self.from_path[-1:] != "/"
    def sourceloc(self):
        "Render the copy source as REV:PATH."
        return "{!s}:{!s}".format(self.from_rev, self.from_path)
    def __str__(self):
        return "{!s:<5} {!s} <- {!s}:{!s}".format(self.revision, self.target, self.from_rev, self.from_path)

# pylint: disable=too-few-public-methods
class Modification:
    "Represent a parsed file or directory operation from a see file"
    def __init__(self, path, rev, op):
        # Note that the rev argument is kept under the attribute name "revision".
        self.path, self.revision, self.op = path, rev, op
    def __str__(self):
        return " ".join((str(self.path), "modified at", str(self.revision), "by", str(self.op)))

# pylint: disable=too-many-instance-attributes
class Analyzer:
    "See file graph analyzer"
    def __init__(self):
        # Layout and path-translation settings
        self.delimiter = "="
        self.multiproject = None    # unknown until read() sees the first add
        self.supers = set()
        self.goals = set()
        # Project graph
        self.projects = set()
        self.sources = set()
        self.depends_on = set()     # (target project, source project) pairs
        self.inbound = {}           # target project -> set of source projects
        self.outbound = {}          # source project -> set of target projects
        # Per-line records accumulated by read()
        self.copies = []
        self.modifications = []
        self.irregular = []         # raw see lines flagged by oddcheck()
        self.revcount = ""

    def pathout(self, s):
        "Transform a path from internal to external form."
        # External form has a plain slash wherever the internal form
        # carries the delimiter.
        internal_sep, external_sep = self.delimiter, "/"
        return s.replace(internal_sep, external_sep)

    def pathin(self, path):
        "Path to internal form, replacing the slash after a super prefix with delimiter."
        for topseg in self.supers:
            prefix = topseg + "/"
            # A path naming just the super directory itself is left alone.
            if path.startswith(prefix) and len(path) > len(prefix):
                path = topseg + self.delimiter + path[len(prefix):]
        return path

    def get_project(self, path):
        """From a path, get its project directory.

        In multiproject mode the project is the first path segment, or the
        first two segments when the path lies directly under a super.

        Otherwise the path is expected to start with trunk/, tags/ or
        branches/: the project is the segment after "trunk", or the segment
        after the tag or branch name.  Returns "" when the path is too short
        to contain a project segment (for example "tags/").

        Raises ValueError if the path is not under trunk, tags or branches.
        """
        segments = path.split("/")
        if self.multiproject:
            for prefix in self.supers:
                if path.startswith(prefix + "/"):
                    return "/".join(segments[:2])
            return segments[0]
        # Top-level directory was the project directory before swapping.
        # Note that super mapping isn't done here.
        top = segments[0]
        if top == "trunk":
            where = 1
        elif top in {"tags", "branches"}:
            # Skip over the tag or branch name
            where = 2
        else:
            raise ValueError("%r is not under trunk, tags, or branches" % (path,))
        return segments[where] if where < len(segments) else ""

    def oddcheck(self, path, op):
        "Predicate: does a path not fulfill standard structure?"
        stdlayout = {"trunk", "branches", "tags"}
        segments = path.split("/")
        # A trailing slash (empty final segment) marks a directory.
        isdir = segments[-1] == ""
        if isdir:
            del segments[-1]
        top_is_standard = segments[0] in stdlayout
        if not self.multiproject:
            # Normal layout: everything must live under trunk/branches/tags
            if not top_is_standard:
                return True
        elif top_is_standard:
            # Multiproject layout should not have standard names at top level
            return True
        elif len(segments) == 1:
            # Only adding or deleting a bare project directory is unremarkable;
            # copy and change are interesting
            return not (isdir and op in {"add", "delete"})
        elif segments[1] not in stdlayout:
            # No standard structure directly under the project: weird
            return True
        # Nothing further down should look like standard Subversion structure.
        return any(segment in stdlayout for segment in segments[2:])

    def copies_from(self, t, s):
        "Predicate: does project t copy from project s?"
        # depends_on holds (target, source) pairs
        edge = (t, s)
        return edge in self.depends_on

    def copies_to(self, t, s):
        "Predicate: does project t copy to project s?"
        # depends_on holds (target, source) pairs, so t is the source here
        edge = (s, t)
        return edge in self.depends_on

    def closure(self, startpoints, relation):
        "Compute transitive closure of a set of project directories under a specified relation"
        reached = startpoints.copy()
        while True:
            # Known projects related to something already reached but not
            # yet collected themselves
            fresh = {
                candidate
                for member in reached
                for candidate in self.projects
                if relation(member, candidate) and candidate not in reached
            }
            if not fresh:
                return reached
            reached = reached.union(fresh)

    def get_project_set(self, line):
        "Parse a list of projects from a line, using shell-like quoting if required"
        names = shlex.split(line)
        # Every name is converted to internal form before being collected
        return set(map(self.pathin, names))

    def dotgraph(self, options=""):
        """Compute DOT notation for the project-directory graph.

        options is a string of modifiers; if it contains "sinks", non-goal
        projects with no outbound copies are filled in green.  It defaults
        to no modifiers so the method can be called without an argument.

        Goals are drawn as boxes, and each edge points from the copy source
        to the copy target.  Nodes and edges are emitted in sorted order so
        the output is the same from run to run.  Returns the DOT text.
        """
        lines = ["digraph {\n"]
        for project in sorted(self.goals):
            lines.append('\t"%s" [shape=box];\n' % (self.pathout(project),))
        if "sinks" in options:
            for project in sorted(self.projects - self.goals):
                if project not in self.outbound:
                    lines.append('\t"%s" [color=green,style=filled];\n' % (self.pathout(project),))
        for (target, source) in sorted(self.depends_on):
            lines.append('\t"%s" -> "%s";\n' % (self.pathout(source), self.pathout(target)))
        lines.append("}\n")
        return "".join(lines)

    # pylint: disable=too-many-branches,too-many-statements
    def read(self, seefile, limit):
        """Read in a see file, accumulating projects, copies and modifications.

        seefile is the name of the report to read.  limit is compared as a
        string against the revision part of each line's first field; reading
        stops after the first processed line for which the two are equal.

        Returns True if reading stopped at the limit, False if the whole
        file was consumed.  Exits with SystemExit(1) on a copy line that
        lacks a "from" field or a REV:PATH source.
        """
        with open(seefile) as seefp:
            for dline in seefp:
                if "propset" in dline:
                    continue
                fields = dline.split()
                if len(fields) < 2:
                    # Blank or truncated line: nothing to parse
                    continue
                number = fields.pop(0)
                op = fields.pop(0)
                # NOTE(review): the revision is split off at "-" here, while
                # coalesce() and do_sever() treat "." as the revision/node
                # separator -- confirm against the actual see report format.
                self.revcount = number.split("-")[0]
                if op == "add":
                    path = self.pathin(" ".join(fields))
                    if self.multiproject is None:
                        # The first add decides which layout we are looking at
                        self.multiproject = path.split("/")[0] not in {"trunk", "tags", "branches"}
                    if self.oddcheck(path, op):
                        self.irregular.append(dline)
                    self.modifications.append(Modification(path, number, op))
                    project = self.get_project(path)
                    if project:
                        if project.endswith("/") and project[:-1] in self.supers:
                            # The super directory itself is not a project
                            continue
                        self.projects.add(project)
                elif op == "copy":
                    try:
                        ind = fields.index("from")
                    except ValueError:
                        sys.stderr.write("%s: no from field in '%s'\n" % (sys.argv[0], dline.strip()))
                        raise SystemExit(1)
                    target = self.pathin(" ".join(fields[:ind]))
                    # Remember whether this line was already flagged so that
                    # an odd source doesn't record it a second time.
                    flagged = self.oddcheck(target, op)
                    if flagged:
                        self.irregular.append(dline)
                    # The modified path of a copy is its target
                    self.modifications.append(Modification(target, number, op))
                    targetdir = self.get_project(target)
                    if target.endswith("/") and targetdir[:-1] in self.supers:
                        sys.stderr.write("%s: unexpected copy to super %s at r%s\n" % (sys.argv[0], targetdir, number))
                    self.projects.add(targetdir)
                    # Split REV:PATH at the first colon only, so that colons
                    # in the path itself survive.
                    sourcerev, colon, sourcepath = " ".join(fields[ind+1:]).partition(":")
                    if not colon:
                        sys.stderr.write("%s: no source revision in '%s'\n" % (sys.argv[0], dline.strip()))
                        raise SystemExit(1)
                    source = self.pathin(sourcepath)
                    if self.oddcheck(source, op) and not flagged:
                        self.irregular.append(dline)
                    sourcedir = self.get_project(source)
                    self.copies.append(Copy(self, number, target, sourcerev, source))
                    if sourcedir != targetdir and sourcedir not in self.supers and targetdir not in self.supers:
                        # Cross-project copy: record the dependency edge
                        self.sources.add(sourcedir)
                        self.depends_on.add((targetdir, sourcedir))
                        self.inbound.setdefault(targetdir, set()).add(sourcedir)
                        self.outbound.setdefault(sourcedir, set()).add(targetdir)
                elif op in {"change", "delete"}:
                    path = self.pathin(" ".join(fields))
                    if self.oddcheck(path, op):
                        self.irregular.append(dline)
                    self.modifications.append(Modification(path, number, op))
                if self.revcount == limit:
                    return True
        return False

# Generic analyzer machinery ends here

# pylint: disable=wrong-import-position
import cmd, os.path

# pylint: disable=too-many-public-methods
class Reposwapper(cmd.Cmd, Analyzer):
    "Multiproject repository explorer."
    def __init__(self):
        # Both bases are initialized explicitly, command interpreter first.
        for base in (cmd.Cmd, Analyzer):
            base.__init__(self)
        self.prompt = "> "

    def display(self, lst, wrapped):
        "Display a list of file paths."
        # Wrapped items are rendered as quoted, caret-anchored patterns.
        template = " '^%s'" if wrapped else " %s"
        rendered = [template % self.pathout(item) for item in sorted(lst)]
        sys.stdout.write("".join(rendered) + "\n")

    # pylint: disable=no-self-use
    def help_intro(self):
        "Show the module documentation as an introduction."
        overview = __doc__
        print(overview)

    # pylint: disable=no-self-use
    def help_read(self):
        "Show help for the read command."
        text = """
Read a "repocutter see" report.  If a second argument is given, it sets the
uppermost revision to be analyzed.
"""
        sys.stdout.write(text)
    def do_read(self, line):
        # Usage: read SEEFILE [LIMIT].  Without a limit the placeholder
        # "infinity" is passed, which is not expected to match any revision.
        cfields = line.split()
        if not cfields:
            # Report the missing argument instead of dying on an IndexError
            print("read requires a see-file argument.")
            return False
        limit = cfields[1] if len(cfields) > 1 else "infinity"
        try:
            if self.read(cfields[0], limit):
                print("Stopped")
        except FileNotFoundError:
            print("No such file, bailing out!")
            raise SystemExit(1)
        return False

    # pylint: disable=no-self-use
    def help_supers(self):
        "Show help for the supers command."
        text = """
Declare the named directories to be supers - each *subdirectory* of them
is treated as a project for dependency-tracking purposes. Must be done before
read to be effective.
"""
        sys.stdout.write(text)
    def do_supers(self, line):
        # Merge the named directories into the set of supers; a fresh set
        # is bound rather than updating the old one in place.
        named = self.get_project_set(line)
        self.supers = self.supers | named
        return False

    def do_goals(self, line):
        "Designate a set of projects as goals to be preserved in the conversion."
        # A fresh set is bound rather than updating the old one in place.
        named = self.get_project_set(line)
        self.goals = self.goals | named
        return False

    def do_list(self, line):
        "List the contents of a property. With no argument, list defined properties."
        properties = {
            "sources": self.sources,
            "projects": self.projects,
            "supers": self.supers,
            "goals": self.goals,
        }
        if line == "":
            # No argument: show the property names themselves
            self.display(list(properties), False)
        elif line in properties:
            self.display(properties[line], False)
        else:
            print("No such property.")
        return False

    def do_upstream(self, line):
        "Display the copies-from closure of a set of project directories."
        starts = self.get_project_set(line)
        reached = self.closure(starts, self.copies_from)
        # Shown in external path form
        print({self.pathout(p) for p in reached})
        return False

    def do_downstream(self, line):
        "Display the copies-to closure of a set of project directories."
        starts = self.get_project_set(line)
        reached = self.closure(starts, self.copies_to)
        # Shown in external path form
        print({self.pathout(p) for p in reached})
        return False

    # pylint: disable=invalid-name
    def do_EOF(self, _unused):
        "Terminate the interpreter."
        # Finish the prompt line, then return a true value to stop the loop
        print()
        return True

    # pylint: disable=no-self-use
    def help_expunge(self):
        "Show help for the expunge command."
        text = """
Generates a repocutter expunge command to strip out all project directories
the goals do not depend on.  The result is the complement relative to the
list of projects of the transitive completion of the depends-on relationship.
"""
        sys.stdout.write(text)
    def do_expunge(self, line):
        # Emit a repocutter expunge command covering every known project
        # outside the copies-from closure of the projects named on the line.
        if self.depends_on:
            keep = self.closure(self.get_project_set(line), self.copies_from)
            sys.stdout.write('repocutter expunge \\\n')
            self.display(self.projects - keep, True)
        else:
            print("No dependency list")
        return False

    def help_sift(self):
        "Show help for the sift command."
        text = """
Generates a repocutter sift command to select all project directories
the goals depend on.  The result is the the transitive completion of the
depends-on relationship.
"""
        sys.stdout.write(text)
    def do_sift(self, line):
        # Emit a repocutter sift command covering the copies-from closure
        # of the projects named on the line.
        if self.depends_on:
            keep = self.closure(self.get_project_set(line), self.copies_from)
            sys.stdout.write('repocutter sift \\\n')
            self.display(keep, False)
        else:
            print("No dependency list")
        return False

    def do_terminals(self, _line):
        "List (non-endpoint) projects with no dependents."
        # Projects that appear as the source side of some dependency
        copied_from = {source for (_, source) in self.depends_on}
        terminals = {
            self.pathout(project)
            for project in self.projects
            if project not in self.goals and project not in copied_from
        }
        self.display(terminals, False)

    def do_swapcheck(self, _line):
        "Display all operation with paths that won't swap to a canonical form."
        # The stored lines are written back out exactly as recorded
        sys.stdout.writelines(self.irregular)

    # pylint: disable=no-self-use
    def help_sever(self):
        "Show help for the sever command."
        text = """
Compute a selection set that includes the locations of all file copies from a specified branch,
and for each one the most recent revision where the copy source can be found.  Emit a repocutter
filecopy command to resolve the outbound file copies.
"""
        sys.stdout.write(text)
    def do_sever(self, line):
        # Collect, for every cross-project file copy (optionally restricted
        # to sources under the prefix given on the line), the node of the
        # copy and the node that last added or changed the copy source at or
        # before the source revision; then print a repocutter filecopy
        # command selecting those nodes.  On the first source that can't be
        # resolved a diagnostic goes to stderr and nothing is printed.

        # Index the modification history by path, preserving file order
        history = {}
        for mod in self.modifications:
            history.setdefault(mod.path, []).append(mod)

        sever_set = set()
        for copy in self.copies:
            # Not trying to resolve directory copies, that is seriously hard.
            # We're only resolving exterior file copies here
            if not copy.filecopy() or copy.interior():
                continue
            # All filecopies, or filecopies with a specified from-path prefix
            if line and not copy.from_path.startswith(line + "/"):
                continue
            sever_set.add(copy.revision)
            from_rev = int(copy.from_rev)
            # Walk the source path's history from newest to oldest
            for here in reversed(history.get(copy.from_path, [])):
                # Compare whole revision numbers: nodes of the source
                # revision itself are part of what was copied, and a float
                # comparison of "REV.NODE" strings would exclude them.
                if int(str(here.revision).split(".")[0]) > from_rev:
                    continue
                # Aha, found the latest possible source
                if here.op in {"add", "change"}:
                    sever_set.add(str(here.revision))
                    break
                sys.stderr.write("Unexpected copy or delete while looking for source of %s\n" % copy)
                return
            else:
                sys.stderr.write("Unexpected failure while looking for source of %s\n" % copy.sourceloc())
                return
        print("repocutter -r", ",".join(compactify(list(sever_set))), "filecopy")

    def help_graph(self):
        "Show help for the graph command."
        text = """
Display the project dependency graph. The arrow relationship is "is copied to".

Goal projects are shown as boxes rather than ovals.

Use the modifier "sinks" to highligh projects with no outbound links in green.

Requires graphviz and imagemagick to be installed.
"""
        sys.stdout.write(text)
    def do_graph(self, line):
        # Render the graph with dot and show it with display; the command
        # argument is handed to dotgraph() as its modifier string.
        # Encode as UTF-8 so that non-ASCII path names don't raise
        # UnicodeEncodeError.
        dot = self.dotgraph(line).encode("utf-8")
        # The command string is a constant, so running it through the shell
        # involves no user-controlled text.
        with subprocess.Popen("dot -Tpng | display", shell=True, stdin=subprocess.PIPE) as proc:
            proc.stdin.write(dot)

    def do_dump(self, _line):
        "Dump the DOT notation for the dependency graph."
        # dotgraph() needs a modifier string; pass the command argument
        # through so that "dump sinks" mirrors "graph sinks".
        print(self.dotgraph(_line))

    def do_stats(self, _unused):
        "Print statistics on the DAG."
        print("%s revisions" % self.revcount)
        print("%d dependencies" % len(self.depends_on))
        print("%d projects" % len(self.projects))
        print("%d goals" % len(self.goals))
        print("%d modification" % len(self.modifications))
        # Count the recorded copy operations, not the goals
        print("%d copies" % len(self.copies))
        return False

    def do_delimiter(self, line):
        "Set the delimiter used for supers."
        if not line:
            # An empty delimiter would make pathout() insert a slash
            # between all characters of every path.
            print("delimiter requires a non-empty argument.")
            return False
        self.delimiter = line
        return False

    # pylint: disable=no-self-use
    def help_copies(self):
        "Show help for the copies command."
        text = """
Display cross-project copies involving the specified prefix."

Usage is 'copies [all] [to|from] [PREFIX]'.

The keyword "all" enables display of copies within projects.

With no prefix, display all matching copies
"""
        sys.stdout.write(text)
    def do_copies(self, line):
        # Usage: copies [all] [to|from] [PREFIX]
        #
        # Keywords are recognized as whole leading words, so a prefix such
        # as "tools" is not mistaken for the keyword "to".  "all" also lists
        # copies within a project; "to" restricts matching to copy targets
        # and "from" to copy sources.
        show_from = True
        show_to = True
        show_interior = False
        rest = line.strip()
        word, _, tail = rest.partition(" ")
        if word == "all":
            show_interior = True
            rest = tail.strip()
            word, _, tail = rest.partition(" ")
        if word == "from":
            show_to = False
            rest = tail.strip()
        elif word == "to":
            show_from = False
            rest = tail.strip()
        prefix = rest
        for copy in self.copies:
            matches = (show_to and copy.target.startswith(prefix)) or \
                      (show_from and copy.from_path.startswith(prefix))
            # Interior copies are listed only when "all" was given
            if matches and (show_interior or not copy.interior()):
                print(copy)

    # pylint: disable=no-self-use
    def help_script(self):
        "Show help for the script command."
        text = """
Read and execute a file of reposwapper commands.

This can be useful for setting goals and supers.
"""
        sys.stdout.write(text)
    def do_script(self, line):
        # Execute each line of the named file as an interpreter command.
        # Blank lines and lines starting with "#" are skipped; a blank line
        # handed to onecmd() would re-run the previous command.
        try:
            with open(line) as script:
                commands = [sline.strip() for sline in script]
        except OSError as err:
            # Say why nothing happened rather than silently doing nothing
            print("Can't read script %s: %s" % (line, err))
            return False
        for command in commands:
            if not command or command.startswith("#"):
                continue
            self.onecmd(self.precmd(command))
        return False

    def do_timeline(self, _line):
        "Report a timeline of project creations."
        for mod in self.modifications:
            parts = mod.path.split("/")
            if mod.op != "add" or len(parts) != 3:
                continue
            top, project, leaf = parts
            # Only the add of a trunk/PROJECT/ directory counts as a creation
            if top == "trunk" and project in self.projects and leaf == "":
                print("%s: %s" % (project, mod.revision))

    def cleanup(self):
        "Hook run when the interpreter is finished; there is nothing to release."
        return None

def main():
    """Run each command-line argument as one or more interpreter commands.

    Every argument may hold several commands separated by ";".  The
    command "-" enters the interactive loop, which is also the default
    when no arguments are given.  A keyboard interrupt ends the program
    quietly.
    """
    try:
        interpreter = Reposwapper()
        args = sys.argv[1:] or ["-"]
        try:
            for arg in args:
                for acmd in arg.split(";"):
                    # Tolerate whitespace around ";" and skip empty pieces;
                    # an empty command would re-run the previous one.
                    acmd = acmd.strip()
                    if not acmd:
                        continue
                    if acmd == '-':
                        interpreter.cmdloop()
                    else:
                        interpreter.onecmd(interpreter.precmd(acmd))
        finally:
            interpreter.cleanup()
    except KeyboardInterrupt:
        sys.stdout.write("\n")

if __name__ == '__main__':
    main()
