1#!/usr/bin/env python
2#
3# Copyright (C) 2022 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17from sys import exit
18from typing import List
19from glob import glob
20from pathlib import Path
21from collections import defaultdict
22from difflib import Differ
23from re import split
24from tqdm import tqdm
25import argparse
26
27
# Length of the prefix that difflib.Differ puts in front of every output line
# (see DifferCodes); line[:DIFFER_CODE_LEN] is the code and
# line[DIFFER_CODE_LEN:] is the original line content.
DIFFER_CODE_LEN = 2
29
class DifferCodes:
    """Two-character prefixes found at the start of difflib.Differ output lines."""

    # Line present in both sequences.
    COMMON = '  '
    # Line present only in the first sequence.
    UNIQUE_FIRST = '- '
    # Line present only in the second sequence.
    UNIQUE_SECOND = '+ '
    # Hint line pointing at intraline differences; present in neither sequence.
    DIFF_IDENT = '? '
35
36class FilesDiffAnalyzer:
37    def __init__(self, args) -> None:
38        self.out_dir = args.out_dir
39        self.show_diff = args.show_diff
40        self.skip_words = args.skip_words
41        self.first_dir = args.first_dir
42        self.second_dir = args.second_dir
43        self.include_common = args.include_common
44
45        self.first_dir_files = self.get_files(self.first_dir)
46        self.second_dir_files = self.get_files(self.second_dir)
47        self.common_file_map = defaultdict(set)
48
49        self.map_common_files(self.first_dir_files, self.first_dir)
50        self.map_common_files(self.second_dir_files, self.second_dir)
51
52    def get_files(self, dir: str) -> List[str]:
53        """Get all files directory in the input directory including the files in the subdirectories
54
55        Recursively finds all files in the input directory.
56        Returns a list of file directory strings, which do not include directories but only files.
57        List is sorted in alphabetical order of the file directories.
58
59        Args:
60            dir: Directory to get the files. String.
61
62        Returns:
63            A list of file directory strings within the input directory.
64            Sorted in Alphabetical order.
65
66        Raises:
67            FileNotFoundError: An error occurred accessing the non-existing directory
68        """
69
70        if not dir_exists(dir):
71            raise FileNotFoundError("Directory does not exist")
72
73        if dir[:-2] != "**":
74            if dir[:-1] != "/":
75                dir += "/"
76            dir += "**"
77
78        return [file for file in sorted(glob(dir, recursive=True)) if Path(file).is_file()]
79
80    def map_common_files(self, files: List[str], dir: str) -> None:
81        for file in files:
82            file_name = file.split(dir, 1)[-1]
83            self.common_file_map[file_name].add(dir)
84        return
85
86    def compare_file_contents(self, first_file: str, second_file: str) -> List[str]:
87        """Compare the contents of the files and return different lines
88
89        Given two file directory strings, compare the contents of the two files
90        and return the list of file contents string prepended with unique identifier codes.
91        The identifier codes include:
92        - '  '(two empty space characters): Line common to two files
93        - '- '(minus followed by a space) : Line unique to first file
94        - '+ '(plus followed by a space)  : Line unique to second file
95
96        Args:
97            first_file: First file directory string to compare the content
98            second_file: Second file directory string to compare the content
99
100        Returns:
101            A list of the file content strings. For example:
102
103            [
104                "  Foo",
105                "- Bar",
106                "+ Baz"
107            ]
108        """
109
110        d = Differ()
111        first_file_contents = sort_methods(get_file_contents(first_file))
112        second_file_contents = sort_methods(get_file_contents(second_file))
113        diff = list(d.compare(first_file_contents, second_file_contents))
114        ret = [f"diff {first_file} {second_file}"]
115
116        idx = 0
117        while idx < len(diff):
118            line = diff[idx]
119            line_code = line[:DIFFER_CODE_LEN]
120
121            match line_code:
122                case DifferCodes.COMMON:
123                    if self.include_common:
124                        ret.append(line)
125
126                case DifferCodes.UNIQUE_FIRST:
127                    # Should compare line
128                    if (idx < len(diff) - 1 and
129                        (next_line_code := diff[idx + 1][:DIFFER_CODE_LEN])
130                        not in (DifferCodes.UNIQUE_FIRST, DifferCodes.COMMON)):
131                        delta = 1 if next_line_code == DifferCodes.UNIQUE_SECOND else 2
132                        line_to_compare = diff[idx + delta]
133                        if self.lines_differ(line, line_to_compare):
134                            ret.extend([line, line_to_compare])
135                        else:
136                            if self.include_common:
137                                ret.append(DifferCodes.COMMON +
138                                           line[DIFFER_CODE_LEN:])
139                        idx += delta
140                    else:
141                        ret.append(line)
142
143                case DifferCodes.UNIQUE_SECOND:
144                    ret.append(line)
145
146                case DifferCodes.DIFF_IDENT:
147                    pass
148            idx += 1
149        return ret
150
151    def lines_differ(self, line1: str, line2: str) -> bool:
152        """Check if the input lines are different or not
153
154        Compare the two lines word by word and check if the two lines are different or not.
155        If the different words in the comparing lines are included in skip_words,
156        the lines are not considered different.
157
158        Args:
159            line1:      first line to compare
160            line2:      second line to compare
161
162        Returns:
163            Boolean value indicating if the two lines are different or not
164
165        """
166        # Split by '.' or ' '(whitespace)
167        def split_words(line: str) -> List[str]:
168            return split('\\s|\\.', line[DIFFER_CODE_LEN:])
169
170        line1_words, line2_words = split_words(line1), split_words(line2)
171        if len(line1_words) != len(line2_words):
172            return True
173
174        for word1, word2 in zip(line1_words, line2_words):
175            if word1 != word2:
176                # not check if words are equal to skip word, but
177                # check if words contain skip word as substring
178                if all(sw not in word1 and sw not in word2 for sw in self.skip_words):
179                    return True
180
181        return False
182
183    def analyze(self) -> None:
184        """Analyze file contents in both directories and write to output or console.
185        """
186        for file in tqdm(sorted(self.common_file_map.keys())):
187            val = self.common_file_map[file]
188
189            # When file exists in both directories
190            lines = list()
191            if val == set([self.first_dir, self.second_dir]):
192                lines = self.compare_file_contents(
193                    self.first_dir + file, self.second_dir + file)
194            else:
195                existing_dir, not_existing_dir = (
196                    (self.first_dir, self.second_dir) if self.first_dir in val
197                    else (self.second_dir, self.first_dir))
198
199                lines = [f"{not_existing_dir}{file} does not exist."]
200
201                if self.show_diff:
202                    lines.append(f"Content of {existing_dir}{file}: \n")
203                    lines.extend(get_file_contents(existing_dir + file))
204
205            self.write(lines)
206
207    def write(self, lines: List[str]) -> None:
208        if self.out_dir == "":
209            pprint(lines)
210        else:
211            write_lines(self.out_dir, lines)
212
213###
214# Helper functions
215###
216
def sort_methods(lines: List[str]) -> List[str]:
    """Sort class methods in the file contents by alphabetical order

    Given lines of Java file contents, return lines with class methods sorted in alphabetical order.
    Also omit empty lines or lines with spaces.
    For example:
        l = [
            "package android.test;",
            "",
            "public static final int ORANGE = 1;",
            "",
            "public class TestClass {",
            "public TestClass() { throw new RuntimeException("Stub!"); }",
            "public void foo() { throw new RuntimeException("Stub!"); }",
            "public void bar() { throw new RuntimeException("Stub!"); }",
            "}"
        ]
        sort_methods(l) returns
        [
            "package android.test;",
            "public static final int ORANGE = 1;",
            "public class TestClass {",
            "public TestClass() { throw new RuntimeException("Stub!"); }",
            "public void bar() { throw new RuntimeException("Stub!"); }",
            "public void foo() { throw new RuntimeException("Stub!"); }",
            "}"
        ]

    A class starts at the first line containing the text "class" and ends at
    the next line whose first character is "}". If the input ends before such
    a closing line is seen, the lines collected so far are still emitted
    (sorted) instead of being dropped.

    Args:
        lines: List of strings consisting of Java file contents.

    Returns:
        A list of strings with sorted class methods and without blank lines.

    """
    def is_not_blank(l: str) -> bool:
        return bool(l) and not l.isspace()

    ret = []

    in_class = False
    # Non-blank lines of the class body currently being collected.
    buffer = []
    for line in lines:
        if not in_class:
            if "class" in line:
                in_class = True
                ret.append(line)
            elif is_not_blank(line):
                # Adding static variables, package info, etc.
                # Skipping empty or space lines.
                ret.append(line)
        elif line.startswith("}"):
            # End of class
            in_class = False
            ret.extend(sorted(buffer))
            buffer = []
            ret.append(line)
        elif is_not_blank(line):
            buffer.append(line)

    # A class body that never reached a closing "}" at column 0 must not be
    # lost; flush whatever is still buffered.
    ret.extend(sorted(buffer))

    return ret
281
def get_file_contents(file_path: str) -> List[str]:
    """Read a text file and return its lines without the trailing newline.

    Args:
        file_path: Path of the file to read.

    Returns:
        One string per line of the file, in file order.
    """
    contents = []
    # The with-statement closes the file on exit.
    with open(file_path) as source:
        for raw_line in source:
            contents.append(raw_line.rstrip('\n'))
    return contents
288
def pprint(l: List[str]) -> None:
    """Print every element of the list on its own line; print nothing if it is empty."""
    if not l:
        return
    print(*l, sep="\n")
292
def write_lines(out_dir: str, lines: List[str]) -> None:
    """Append the lines to a file, followed by one blank separator line.

    Args:
        out_dir: Path of the file to append to. It is opened in append mode.
        lines: Lines to write; a newline is added after each one.
    """
    # The with-statement closes the file on exit.
    with open(out_dir, "a") as sink:
        for entry in lines:
            sink.write(entry + '\n')
        sink.write("\n")
298
def dir_exists(dir: str) -> bool:
    """Return True if the given path exists on the file system.

    The check is Path.exists(), so it is also True for an existing path that
    is not a directory.
    """
    candidate = Path(dir)
    return candidate.exists()
301
if __name__ == '__main__':
    # Command-line entry point: parse the options, then compare the two
    # directories and print or log the result.
    parser = argparse.ArgumentParser()
    parser.add_argument('first_dir', action='store', type=str,
                        help="first path to compare file directory and contents")
    parser.add_argument('second_dir', action='store', type=str,
                        help="second path to compare file directory and contents")
    parser.add_argument('--out', dest='out_dir',
                        action='store', default="", type=str,
                        help="optional directory to write log. If not set, will print to console")
    parser.add_argument('--show-diff-file', dest='show_diff',
                        action=argparse.BooleanOptionalAction,
                        help="optional flag. If passed, will print out the content of the file unique to each directories")
    parser.add_argument('--include-common', dest='include_common',
                        action=argparse.BooleanOptionalAction,
                        help="optional flag. If passed, will print out the contents common to both files as well,\
                            instead of printing only diff lines.")
    parser.add_argument('--skip-words', nargs='+',
                        dest='skip_words', default=[], help="optional words to skip in comparison")

    args = parser.parse_args()

    # argparse already rejects missing positionals, so only empty strings can
    # reach this check. Report them as a usage error with a non-zero exit
    # status instead of exiting successfully.
    if not args.first_dir or not args.second_dir:
        parser.error("first_dir and second_dir must not be empty")

    analyzer = FilesDiffAnalyzer(args)
    analyzer.analyze()
329