1#!/usr/bin/python3
2#
3# Copyright (C) 2021 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16"""Utilities for comparing two version of a codebase."""
17
18import argparse
19import difflib
20import filecmp
21import os
22import pathlib
23import re
24
25
class FileStat:
  """Holds the name, size and line/diff counters of one file.

  Attributes:
    file_name: str() of the path given at construction, '' for a placeholder.
    size: File size in bytes taken from stat(), 0 for a placeholder.
    line_cnt: Line counter, starts at 0 and is filled in by callers.
    group_cnt: Diff group counter, starts at 0.
    add_line_cnt: Added line counter, starts at 0.
    remove_line_cnt: Removed line counter, starts at 0.
    replace_line_cnt: Replaced line counter, starts at 0.
  """

  NON_TEXT = 0
  TEXT = 1

  def __init__(self, file_path):
    """Initializes the statistics from a path object.

    Args:
      file_path: An object providing stat() (e.g. a pathlib.Path), or a falsy
        value such as None to create an empty placeholder.
    """
    has_file = bool(file_path)
    self.file_name = str(file_path) if has_file else ''
    self.size = file_path.stat().st_size if has_file else 0

    # Every counter starts from zero.
    self.line_cnt = self.group_cnt = 0
    self.add_line_cnt = self.remove_line_cnt = self.replace_line_cnt = 0

  @staticmethod
  def get_csv_header(prefix=None):
    """Returns the comma separated column names.

    Args:
      prefix: When truthy, every column name is emitted as '<prefix>_<name>'.
    """
    columns = ('file', 'size', 'line', 'group', 'add', 'remove', 'replace')
    if not prefix:
      return ','.join(columns)
    return ','.join(f'{prefix}_{column}' for column in columns)

  def get_csv_str(self, strip_dir_len=0):
    """Returns the statistics as one CSV row, in get_csv_header() order.

    Args:
      strip_dir_len: Number of leading characters dropped from the file name.
    """
    short_name = FileStat.no_comma(self.file_name[strip_dir_len:])
    fields = (short_name, self.size, self.line_cnt, self.group_cnt,
              self.add_line_cnt, self.remove_line_cnt, self.replace_line_cnt)
    return ','.join(map(str, fields))

  @staticmethod
  def no_comma(astr):
    """Returns astr with every ',' turned into '_'."""
    return '_'.join(astr.split(','))
69
70
class DiffStat:
  """Diff statistic class for 2 versions of a file.

  Attributes:
    old_file_stat: The statistics object of the old version.
    new_file_stat: The statistics object of the new version.
    name: The file name common to both versions.
    ext: The extension of name without the leading dot.
    state: One of SAME, NEW, REMOVED, MODIFIED or INCOMPARABLE.
    file_type: FileStat.NON_TEXT until a caller overwrites it.
  """

  SAME = 0
  NEW = 1
  REMOVED = 2
  MODIFIED = 3
  INCOMPARABLE = 4

  # These align with https://github.com/python/cpython/blob/3.9/Lib/difflib.py
  _OLD_FILE_HEADER = re.compile(r'\*{3} (.*)')
  _NEW_FILE_HEADER = re.compile(r'-{3} (.*)')
  _GROUP_SEPARATOR = '***************'
  # difflib prints a bare "N" instead of "N,M" when a range spans at most one
  # line, so the ",M" part has to be optional.
  _OLD_GROUP_HEADER = re.compile(r'\*{3} (\d+)(?:,(\d+))? \*{4}')
  _NEW_GROUP_HEADER = re.compile(r'-{3} (\d+)(?:,(\d+))? -{4}')

  def __init__(self, common_name, old_file_stat, new_file_stat, state):
    """Initializes with the common names & etc.

    Args:
      common_name: The file name shared by both versions.
      old_file_stat: Statistics object of the old version.
      new_file_stat: Statistics object of the new version.
      state: One of the state constants of this class.
    """
    self.old_file_stat = old_file_stat
    self.new_file_stat = new_file_stat
    self.name = common_name
    self.ext = os.path.splitext(self.name)[1].lstrip('.')
    self.state = state
    self.file_type = FileStat.NON_TEXT

  def add_diff_stat(self, diff_lines):
    """Adds the statistic by the diff lines.

    The group, add, remove and replace counters of old_file_stat and
    new_file_stat are incremented in place.

    Args:
      diff_lines: The list of lines of one difflib.context_diff() output,
        starting with its two file header lines.
    """
    if len(diff_lines) < 2:
      print('ERROR: diff output has no header lines')
      return

    if not self._OLD_FILE_HEADER.match(diff_lines[0]):
      print('ERROR: wrong diff header line 1: %s' % diff_lines[0])
      return

    if not self._NEW_FILE_HEADER.match(diff_lines[1]):
      print('ERROR: wrong diff header line 2: %s' % diff_lines[1])
      return

    # The statistics object of the section being read: old_file_stat after a
    # "*** N,M ****" header, new_file_stat after a "--- N,M ----" header.
    current = None
    in_group = False

    for line in diff_lines[2:]:
      if in_group:
        if line.startswith('  '):
          # equal
          continue
        if line.startswith('! '):
          # replace
          if current is not None:
            current.replace_line_cnt += 1
          continue
        if line.startswith('+ '):
          # add
          if current is not None:
            current.add_line_cnt += 1
          continue
        if line.startswith('- '):
          # removed
          if current is not None:
            current.remove_line_cnt += 1
          continue

      if self._OLD_GROUP_HEADER.match(line):
        current = self.old_file_stat
        current.group_cnt += 1
        continue

      if self._NEW_GROUP_HEADER.match(line):
        current = self.new_file_stat
        current.group_cnt += 1
        continue

      if line.startswith(self._GROUP_SEPARATOR):
        in_group = True
146
147
class ChangeReport:
  """Change report class for the diff statistics on 2 versions of a codebase.

  The directory trees are walked on the first call to get_diff_stat_lines() or
  get_diff_lines(); the results are cached for later calls.

  Attributes:
    old_dir: The old codebase dir as an absolute path string.
    new_dir: The new codebase dir as an absolute path string.
    dircmp: The filecmp.dircmp object comparing old_dir (left) against
      new_dir (right).
  """

  def __init__(self, old_dir, new_dir, ignores=None, state_filter=None):
    """Initializes with old & new dir path strings.

    Args:
      old_dir: Path string of the old codebase dir.
      new_dir: Path string of the new codebase dir.
      ignores: Optional comma separated names to ignore, in addition to
        filecmp.DEFAULT_IGNORES.
      state_filter: Optional comma separated DiffStat state numbers to
        process; all five states are processed when omitted.
    """
    self.old_dir = os.path.abspath(old_dir)
    self._old_dir_prefix_len = len(self.old_dir) + 1
    self.new_dir = os.path.abspath(new_dir)
    self._new_dir_prefix_len = len(self.new_dir) + 1

    # Always build a private list so that the module level
    # filecmp.DEFAULT_IGNORES is never aliased by this instance.
    self._ignores = ignores.split(',') if ignores else []
    self._ignores.extend(filecmp.DEFAULT_IGNORES)

    if state_filter:
      self._state_filter = [int(s) for s in state_filter.split(',')]
    else:
      self._state_filter = [
          DiffStat.SAME, DiffStat.NEW, DiffStat.REMOVED, DiffStat.MODIFIED,
          DiffStat.INCOMPARABLE
      ]

    self._do_same = DiffStat.SAME in self._state_filter
    self._do_new = DiffStat.NEW in self._state_filter
    self._do_removed = DiffStat.REMOVED in self._state_filter
    self._do_modified = DiffStat.MODIFIED in self._state_filter
    self._do_incomparable = DiffStat.INCOMPARABLE in self._state_filter

    self.dircmp = filecmp.dircmp(
        self.old_dir, self.new_dir, ignore=self._ignores)
    self._diff_stats = []
    # None until the CSV lines are built from _diff_stats.
    self._diff_stat_lines = None
    self._diff_lines = []
    self._processed_cnt = 0
    self._common_dir_len = ChangeReport.get_common_path_len(
        self.old_dir, self.new_dir)

  @staticmethod
  def get_common_path_len(dir1, dir2):
    """Gets the length of the common path of old & new folders.

    Args:
      dir1: The first dir path string.
      dir2: The second dir path string.

    Returns:
      The index just past the last path separator inside the leading run of
      characters shared by dir1 and dir2, or 0 if that run has no separator.
    """
    sep = os.path.sep
    common_len = 0
    # zip() stops at the shorter string, so neither one is indexed out of
    # range; the mismatch test comes first so that a separator which is not
    # shared by both paths is never counted.
    for pos, (char1, char2) in enumerate(zip(dir1, dir2)):
      if char1 != char2:
        break
      if char1 == sep:
        common_len = pos + 1
    return common_len

  @staticmethod
  def get_diff_stat_header():
    """Gets the diff statistic CSV header (new file columns before old)."""
    return 'file,ext,text,state,{0},{1}\n'.format(
        FileStat.get_csv_header('new'), FileStat.get_csv_header('old'))

  def get_diff_stat_lines(self):
    """Gets the diff statistic CSV lines, one per reported file."""
    self._ensure_processed()

    # Built independently from the directory walk, so the lines are complete
    # even when get_diff_lines() triggered the walk first.
    if self._diff_stat_lines is None:
      self._diff_stat_lines = [
          '{0},{1},{2},{3},{4},{5}\n'.format(
              FileStat.no_comma(diff_stat.name), diff_stat.ext,
              diff_stat.file_type, diff_stat.state,
              diff_stat.new_file_stat.get_csv_str(self._common_dir_len),
              diff_stat.old_file_stat.get_csv_str(self._common_dir_len))
          for diff_stat in self._diff_stats
      ]

    return self._diff_stat_lines

  def get_diff_lines(self):
    """Gets the diff output lines of all modified text files."""
    self._ensure_processed()
    return self._diff_lines

  def _ensure_processed(self):
    """Walks the directory comparison the first time it is called."""
    if self._processed_cnt < 1:
      self._process_dircmp(self.dircmp)
      self._processed_cnt += 1

  def _process_dircmp(self, dircmp):
    """Compare all files in a dircmp object for diff statistics & output."""
    if self._do_modified:
      self._process_diff_files(dircmp)

    for subdir_dircmp in dircmp.subdirs.values():
      rp = pathlib.Path(subdir_dircmp.right)
      lp = pathlib.Path(subdir_dircmp.left)
      if rp.is_symlink() or lp.is_symlink():
        print('SKIP: symlink: {0} or {1}'.format(subdir_dircmp.right,
                                                 subdir_dircmp.left))
        continue
      self._process_dircmp(subdir_dircmp)

    if self._do_new:
      self._process_others(dircmp.right_only, dircmp.right,
                           self._new_dir_prefix_len, DiffStat.NEW)
    if self._do_same:
      self._process_others(dircmp.same_files, dircmp.right,
                           self._new_dir_prefix_len, DiffStat.SAME)
    if self._do_incomparable:
      self._process_others(dircmp.funny_files, dircmp.right,
                           self._new_dir_prefix_len, DiffStat.INCOMPARABLE)
    if self._do_removed:
      self._process_others(dircmp.left_only, dircmp.left,
                           self._old_dir_prefix_len, DiffStat.REMOVED)

  def _process_others(self, files, adir, prefix_len, state):
    """Processes files that are not modified.

    Args:
      files: File names relative to adir, or full paths below it.
      adir: The dir the names in files are joined to.
      prefix_len: Number of leading path characters to strip for the name
        stored in the DiffStat.
      state: The DiffStat state recorded for every file.
    """
    empty_stat = FileStat(None)
    for file in files:
      file_path = pathlib.Path(adir, file)
      if file_path.is_symlink():
        print('SKIP: symlink: {0}, {1}'.format(state, file_path))
        continue

      if file_path.is_dir():
        # A whole dir exists on one side only: report everything below it.
        flist = self._get_filtered_files(file_path)
        self._process_others(flist, adir, prefix_len, state)
        continue

      file_stat = FileStat(file_path)
      common_name = str(file_path)[prefix_len:]
      if state == DiffStat.REMOVED:
        diff_stat = DiffStat(common_name, file_stat, empty_stat, state)
      else:
        diff_stat = DiffStat(common_name, empty_stat, file_stat, state)

      # A file that does not decode as UTF-8 is reported as non-text.
      try:
        with open(file_path, encoding='utf-8') as f:
          file_stat.line_cnt = sum(1 for _ in f)
        diff_stat.file_type = FileStat.TEXT
      except UnicodeDecodeError:
        diff_stat.file_type = FileStat.NON_TEXT

      self._diff_stats.append(diff_stat)

  def _process_diff_files(self, dircmp):
    """Processes files that are modified."""
    for file in dircmp.diff_files:
      old_file_path = pathlib.Path(dircmp.left, file)
      new_file_path = pathlib.Path(dircmp.right, file)
      self._diff_files(old_file_path, new_file_path)

  def _diff_files(self, old_file_path, new_file_path):
    """Diff old & new files.

    Args:
      old_file_path: pathlib.Path of the old version of the file.
      new_file_path: pathlib.Path of the new version of the file.
    """
    old_file_stat = FileStat(old_file_path)
    new_file_stat = FileStat(new_file_path)
    common_name = str(new_file_path)[self._new_dir_prefix_len:]
    diff_stat = DiffStat(common_name, old_file_stat, new_file_stat,
                         DiffStat.MODIFIED)

    try:
      with open(old_file_path, encoding='utf-8') as f1:
        old_lines = f1.readlines()
      old_file_stat.line_cnt = len(old_lines)
      with open(new_file_path, encoding='utf-8') as f2:
        new_lines = f2.readlines()
      new_file_stat.line_cnt = len(new_lines)
    except UnicodeDecodeError:
      # At least one version does not decode as UTF-8: no line diff is made.
      diff_stat.file_type = FileStat.NON_TEXT
    else:
      diff_stat.file_type = FileStat.TEXT
      diff_lines = list(
          difflib.context_diff(old_lines, new_lines, old_file_path.name,
                               new_file_path.name))
      if diff_lines:
        self._diff_lines.extend(diff_lines)
        diff_stat.add_diff_stat(diff_lines)
      else:
        print('WARNING: no diff lines on {0} {1}'.format(
            old_file_path, new_file_path))

    self._diff_stats.append(diff_stat)

  def _get_filtered_files(self, dir_path):
    """Returns the entries of dir_path that are not ignored nor symlinks."""
    flist = []
    for f in dir_path.glob('*'):
      if f.name in self._ignores:
        continue
      if f.is_symlink():
        print('SKIP: symlink: %s' % f)
        continue
      flist.append(f)
    return flist
340
341
def write_file(file, lines, header=None):
  """Writes lines into a file, replacing any existing content.

  The file is written as UTF-8, the same encoding the compared files are read
  with, so the result does not depend on the locale of the machine.

  Args:
    file: Path of the file to write.
    lines: List of strings written as is; each one carries its own newline.
    header: Optional string written before the lines.
  """
  with open(file, 'w', encoding='utf-8') as f:
    if header:
      f.write(header)

    f.writelines(lines)
  print('OUTPUT: {0}, {1} lines'.format(file, len(lines)))
351
352
def main(argv=None):
  """Parses the command line and writes the requested report files.

  Args:
    argv: Optional list of argument strings; the arguments of the running
      process are used when omitted.

  Raises:
    SystemExit: With status 1 if --old_dir or --new_dir is not an existing
      directory; argparse itself exits on invalid or missing arguments.
  """
  # The text must be passed as description=; the first positional parameter
  # of ArgumentParser is the program name shown in the usage line.
  parser = argparse.ArgumentParser(
      description='Generate a diff stat csv file for 2 versions of a codebase')
  parser.add_argument(
      '--old_dir', required=True, help='the old version codebase dir')
  parser.add_argument(
      '--new_dir', required=True, help='the new version codebase dir')
  parser.add_argument(
      '--csv_file', required=False, help='the diff stat csv file if to create')
  parser.add_argument(
      '--diff_output_file',
      required=False,
      help='the diff output file if to create')
  parser.add_argument(
      '--ignores',
      required=False,
      default='.repo,.git,.github,.idea,__MACOSX,.prebuilt_info',
      help='names to ignore')
  parser.add_argument(
      '--state_filter',
      required=False,
      default='1,2,3',
      help='csv diff states to process, 0:SAME, 1:NEW, 2:REMOVED, 3:MODIFIED, '
      '4:INCOMPARABLE')

  args = parser.parse_args(argv)

  for a_dir in (args.old_dir, args.new_dir):
    if not os.path.isdir(a_dir):
      print('ERROR: %s does not exist.' % a_dir)
      # Report the failure through a non-zero exit status.
      raise SystemExit(1)

  change_report = ChangeReport(args.old_dir, args.new_dir, args.ignores,
                               args.state_filter)
  if args.csv_file:
    write_file(
        args.csv_file,
        change_report.get_diff_stat_lines(),
        header=ChangeReport.get_diff_stat_header())

  if args.diff_output_file:
    write_file(args.diff_output_file, change_report.get_diff_lines())
397
# Run the command-line tool only when this file is executed as a script.
if __name__ == '__main__':
  main()
400