trailofbits · ESultanik · Jan 7, 2022 · Jan 7, 2022 · Jan 7, 2022 · Jan 7, 2022
@@ -5,9 +5,9 @@
 [![Slack Status](https://empireslacking.herokuapp.com/badge.svg)](https://empireslacking.herokuapp.com)
 
 Graphtage is a command-line utility and [underlying library](https://trailofbits.github.io/graphtage/latest/library.html)
-for semantically comparing and merging tree-like structures, such as JSON, XML, HTML, YAML, plist, and CSS files. Its name is a
-portmanteau of “graph” and “graftage”—the latter being the horticultural practice of joining two trees together such
-that they grow as one.
+for semantically comparing and merging tree-like structures, such as JSON, XML, HTML, YAML, plist, and CSS files, as
+well as flame graphs. Its name is a portmanteau of “graph” and “graftage”—the latter being the horticultural practice
+of joining two trees together such that they grow as one.
 
 <p align="center">
   <img src="https://raw.githubusercontent.com/trailofbits/graphtage/master/docs/example.png" title="Graphtage Example">
@@ -85,6 +85,32 @@ By default, Graphtage prints status messages and a progress bar to STDERR. To su
 option. To additionally suppress all but critical log messages, use `--quiet`. Fine-grained control of log messages is
 via the `--log-level` option.
 
+### Specifying File Types
+
+By default, Graphtage makes a best-effort guess of the input file types based upon file extensions and, in some 
+cases, file contents. This is largely based on the
+[Python `mimetypes` library](https://docs.python.org/3/library/mimetypes.html#mimetypes.guess_type).
+
+The input files' MIME types can be explicitly specified using the `--from-mime` and `--to-mime` arguments.
+
+#### Flame Graphs
+
+Graphtage has support for diffing
+[flame graphs](https://trailofbits.github.io/graphtage/latest/graphtage.flamegraph.html).
+This is useful to identify performance regressions between program refactors, _e.g._, when control flow is modified or 
+functions are added, removed, or renamed.
+
+There are many libraries in different languages to produce a flame graph from a profiling run.
+Unfortunately, there isn't a standardized textual file format for representing flame graphs.
+Graphtage uses this common format:
+```
+function1 #samples
+function1;function2 #samples
+function1;function2;function3 #samples
+```
+In other words, each line of the file is a stack trace represented by a ``;``-delimited list of function names
+followed by a space and the integer number of times that stack trace was sampled in the profiling run.
+
 ## Why does Graphtage exist?
 
 Diffing tree-like structures with unordered elements is tough. Say you want to compare two JSON files.

@@ -7,7 +7,7 @@
 from .version import __version__, VERSION_STRING
 from . import bounds, edits, expressions, fibonacci, formatter, levenshtein, matching, printer, \
                                                                search, sequences, tree, utils
-from . import csv, json, xml, yaml, plist
+from . import csv, json, xml, yaml, plist, flamegraph
 
 import inspect
 

@@ -283,8 +283,12 @@ def printer_type(*pos_args, **kwargs):
         with printer:
             with PathOrStdin(args.FROM_PATH) as from_path:
                 with PathOrStdin(args.TO_PATH) as to_path:
-                    from_format = graphtage.get_filetype(from_path, from_mime)
-                    to_format = graphtage.get_filetype(to_path, to_mime)
+                    try:
+                        from_format = graphtage.get_filetype(from_path, from_mime)
+                        to_format = graphtage.get_filetype(to_path, to_mime)
+                    except ValueError as e:
+                        log.error(str(e))
+                        sys.exit(1)
                     from_tree = from_format.build_tree_handling_errors(from_path, options)
                     if isinstance(from_tree, str):
                         sys.stderr.write(from_tree)

@@ -30,6 +30,7 @@
 from intervaltree import Interval, IntervalTree
 
 from .fibonacci import FibonacciHeap
+from .printer import DEFAULT_PRINTER
 
 
 log = logging.getLogger(__name__)
@@ -197,7 +198,7 @@ def definitive(self) -> bool:
         """
         return self.lower_bound == self.upper_bound and not isinstance(self.lower_bound, Infinity)
 
-    def intersect(self, other) -> 'Range':
+    def intersect(self, other) -> "Range":
         """Intersects this range with another."""
         if not self or not other or self < other or other < self:
             return Range()
@@ -388,51 +389,67 @@ def make_distinct(*bounded: Bounded):
             if not b.bounds().finite:
                 raise ValueError(f"Could not tighten {b!r} to a finite bound")
         tree.add(Interval(b.bounds().lower_bound, b.bounds().upper_bound + 1, b))
-    while len(tree) > 1:
-        # find the biggest interval in the tree
-        biggest: Optional[Interval] = None
-        for m in tree:
-            m_size = m.end - m.begin
-            if biggest is None or m_size > biggest.end - biggest.begin:
-                biggest = m
-        assert biggest is not None
-        if biggest.data.bounds().definitive():
-            # This means that all intervals are points, so we are done!
-            break
-        tree.remove(biggest)
-        matching = tree[biggest.begin:biggest.end]
-        if len(matching) < 1:
-            # This interval does not intersect any others, so it is distinct
-            continue
-        # now find the biggest other interval that intersects with biggest:
-        second_biggest: Optional[Interval] = None
-        for m in matching:
-            m_size = m.end - m.begin
-            if second_biggest is None or m_size > second_biggest.end - second_biggest.begin:
-                second_biggest = m
-        assert second_biggest is not None
-        tree.remove(second_biggest)
-        # Shrink the two biggest intervals until they are distinct
-        while True:
-            biggest_bound: Range = biggest.data.bounds()
-            second_biggest_bound: Range = second_biggest.data.bounds()
-            if (biggest_bound.definitive() and second_biggest_bound.definitive()) or \
-                    biggest_bound.upper_bound < second_biggest_bound.lower_bound or \
-                    second_biggest_bound.upper_bound < biggest_bound.lower_bound:
+    last_tree_len = len(tree) - 1
+    with DEFAULT_PRINTER.tqdm(
+            desc="Making Bounds Distinct", unit=" bounds", leave=False, total=last_tree_len, delay=2.0) as d:
+        while len(tree) > 1:
+            remaining_nodes = len(tree) - 1
+            if remaining_nodes < last_tree_len:
+                d.update(last_tree_len - remaining_nodes)
+            # find the biggest interval in the tree
+            biggest: Optional[Interval] = None
+            for m in tree:
+                m_size = m.end - m.begin
+                if biggest is None or m_size > biggest.end - biggest.begin:
+                    biggest = m
+            assert biggest is not None
+            if biggest.data.bounds().definitive():
+                # This means that all intervals are points, so we are done!
                 break
-            biggest.data.tighten_bounds()
-            second_biggest.data.tighten_bounds()
-        new_interval = Interval(
-            begin=biggest.data.bounds().lower_bound,
-            end=biggest.data.bounds().upper_bound + 1,
-            data=biggest.data
-        )
-        if tree.overlaps(new_interval.begin, new_interval.end):
-            tree.add(new_interval)
-        new_interval = Interval(
-            begin=second_biggest.data.bounds().lower_bound,
-            end=second_biggest.data.bounds().upper_bound + 1,
-            data=second_biggest.data
-        )
-        if tree.overlaps(new_interval.begin, new_interval.end):
-            tree.add(new_interval)
+            tree.remove(biggest)
+            matching = tree[biggest.begin:biggest.end]
+            if len(matching) < 1:
+                # This interval does not intersect any others, so it is distinct
+                continue
+            # now find the biggest other interval that intersects with biggest:
+            second_biggest: Optional[Interval] = None
+            for m in matching:
+                m_size = m.end - m.begin
+                if second_biggest is None or m_size > second_biggest.end - second_biggest.begin:
+                    second_biggest = m
+            assert second_biggest is not None
+            tree.remove(second_biggest)
+            # Shrink the two biggest intervals until they are distinct
+            with DEFAULT_PRINTER.tqdm(desc="Tightening Bounding Intervals", delay=2.0, leave=False, unit=" units") as t:
+                last_overlap: Optional[int] = None
+                while True:
+                    biggest_bound: Range = biggest.data.bounds()
+                    second_biggest_bound: Range = second_biggest.data.bounds()
+                    if (biggest_bound.definitive() and second_biggest_bound.definitive()) or \
+                            biggest_bound.upper_bound < second_biggest_bound.lower_bound or \
+                            second_biggest_bound.upper_bound < biggest_bound.lower_bound:
+                        break
+                    # the ranges still overlap
+                    overlap = min(biggest_bound.upper_bound, second_biggest_bound.upper_bound) - \
+                              max(biggest_bound.lower_bound, second_biggest_bound.lower_bound)
+                    if last_overlap is None:
+                        t.total = overlap
+                    elif overlap < last_overlap:
+                        t.update(last_overlap - overlap)
+                    last_overlap = overlap
+                    biggest.data.tighten_bounds()
+                    second_biggest.data.tighten_bounds()
+            new_interval = Interval(
+                begin=biggest.data.bounds().lower_bound,
+                end=biggest.data.bounds().upper_bound + 1,
+                data=biggest.data
+            )
+            if tree.overlaps(new_interval.begin, new_interval.end):
+                tree.add(new_interval)
+            new_interval = Interval(
+                begin=second_biggest.data.bounds().lower_bound,
+                end=second_biggest.data.bounds().upper_bound + 1,
+                data=second_biggest.data
+            )
+            if tree.overlaps(new_interval.begin, new_interval.end):
+                tree.add(new_interval)