#!/usr/bin/env python3
#
# Copyright (C) 2012 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
16"""
17Usage: generate-notice-files --text-output [plain text output file] \
18               --html-output [html output file] \
19               --xml-output [xml output file] \
20               -t [file title] -s [directory of notices]
21
22Generate the Android notice files, including both text and html files.
23
24-h to display this usage message and exit.
25"""
from collections import defaultdict
import argparse
import hashlib
import itertools
import os
import os.path
import re
import struct
import sys

MD5_BLOCKSIZE = 1024 * 1024
HTML_ESCAPE_TABLE = {
    b"&": b"&amp;",
    b'"': b"&quot;",
    b"'": b"&#39;",
    b">": b"&gt;",
    b"<": b"&lt;",
    }

def md5sum(filename):
    """Calculate an MD5 of the file given by FILENAME,
    and return hex digest as a string.
    Output should be compatible with md5sum command"""
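    # For reference, the digest format matches the output of the md5sum(1)
    # tool; e.g. an empty file hashes to "d41d8cd98f00b204e9800998ecf8427e".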

    f = open(filename, "rb")
    sum = hashlib.md5()
    while 1:
        block = f.read(MD5_BLOCKSIZE)
        if not block:
            break
        sum.update(block)
    f.close()
    return sum.hexdigest()


def html_escape(text):
    """Produce entities within text."""
    # Using "for i in text" doesn't work since i will be an int, not a byte.
    # There are multiple ways to solve this, but the most performant way
    # to iterate over a byte string is to use unpack. Iterating with
    # "for i in range(len(text))" and taking one-byte slices is about twice
    # as slow as this method.
    return b"".join(HTML_ESCAPE_TABLE.get(i,i) for i in struct.unpack(str(len(text)) + 'c', text))

HTML_OUTPUT_CSS=b"""
<style type="text/css">
body { padding: 0; font-family: sans-serif; }
.same-license { background-color: #eeeeee; border-top: 20px solid white; padding: 10px; }
.label { font-weight: bold; }
.file-list { margin-left: 1em; color: blue; }
</style>

"""

def combine_notice_files_html(file_hash, input_dirs, output_filename):
    """Combine notice files in FILE_HASH and output an HTML version to OUTPUT_FILENAME."""

    SRC_DIR_STRIP_RE = re.compile("(?:" + "|".join(input_dirs) + ")(/.*).txt")

    # Set up a filename to row id table (anchors inside tables don't work in
    # most browsers, but href's to table row ids do)
    id_table = {}
    id_count = 0
    for value in file_hash:
        for filename in value:
            id_table[filename] = id_count
        id_count += 1

    # Open the output file, and output the header pieces
    output_file = open(output_filename, "wb")

    output_file.write(b"<html><head>\n")
    output_file.write(HTML_OUTPUT_CSS)
    output_file.write(b'</head><body topmargin="0" leftmargin="0" rightmargin="0" bottommargin="0">\n')

    # Output our table of contents
    output_file.write(b'<div class="toc">\n')
    output_file.write(b"<ul>\n")

    # Flatten the list of lists into a single list of filenames
    sorted_filenames = sorted(itertools.chain.from_iterable(file_hash))

    # Print out a nice table of contents
    for filename in sorted_filenames:
        stripped_filename = SRC_DIR_STRIP_RE.sub(r"\1", filename)
        output_file.write(('<li><a href="#id%d">%s</a></li>\n' % (id_table.get(filename), stripped_filename)).encode())

    output_file.write(b"</ul>\n")
    output_file.write(b"</div><!-- table of contents -->\n")
    # Output the individual notice file lists
    output_file.write(b'<table cellpadding="0" cellspacing="0" border="0">\n')
    for value in file_hash:
        output_file.write(b'<tr id="id%d"><td class="same-license">\n' % id_table.get(value[0]))
        output_file.write(b'<div class="label">Notices for file(s):</div>\n')
        output_file.write(b'<div class="file-list">\n')
        for filename in value:
            output_file.write(("%s <br/>\n" % SRC_DIR_STRIP_RE.sub(r"\1", filename)).encode())
        output_file.write(b"</div><!-- file-list -->\n")
        output_file.write(b"\n")
        output_file.write(b'<pre class="license-text">\n')
        with open(value[0], "rb") as notice_file:
            output_file.write(html_escape(notice_file.read()))
        output_file.write(b"\n</pre><!-- license-text -->\n")
        output_file.write(b"</td></tr><!-- same-license -->\n\n\n\n")

    # Finish off the file output
    output_file.write(b"</table>\n")
    output_file.write(b"</body></html>\n")
    output_file.close()
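
# For reference, each group of identical notices becomes one table row that
# the TOC links to by row id. Illustrative (made-up) fragment of the output:
#
#   <li><a href="#id0">/system/lib/libfoo.txt</a></li>
#   ...
#   <tr id="id0"><td class="same-license">
#   <div class="label">Notices for file(s):</div>
#   <div class="file-list">
#   /system/lib/libfoo.txt <br/>
#   </div><!-- file-list -->
#   <pre class="license-text">
#   ...escaped notice text...
#   </pre><!-- license-text -->
#   </td></tr><!-- same-license -->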

def combine_notice_files_text(file_hash, input_dirs, output_filename, file_title):
    """Combine notice files in FILE_HASH and output a text version to OUTPUT_FILENAME."""

    SRC_DIR_STRIP_RE = re.compile("(?:" + "|".join(input_dirs) + ")(/.*).txt")
    output_file = open(output_filename, "wb")
    output_file.write(file_title.encode())
    output_file.write(b"\n")
    for value in file_hash:
        output_file.write(b"============================================================\n")
        output_file.write(b"Notices for file(s):\n")
        for filename in value:
            output_file.write(SRC_DIR_STRIP_RE.sub(r"\1", filename).encode())
            output_file.write(b"\n")
        output_file.write(b"------------------------------------------------------------\n")
        with open(value[0], "rb") as notice_file:
            output_file.write(notice_file.read())
            output_file.write(b"\n")
    output_file.close()

def combine_notice_files_xml(files_with_same_hash, input_dirs, output_filename):
    """Combine notice files in FILES_WITH_SAME_HASH and output an XML version to OUTPUT_FILENAME."""

    SRC_DIR_STRIP_RE = re.compile("(?:" + "|".join(input_dirs) + ")(/.*).txt")

    # Map each filename to its content id (the MD5 of its notice text)
    id_table = {}
    for file_key, files in files_with_same_hash.items():
        for filename in files:
            id_table[filename] = file_key

    # Open the output file, and output the header pieces
    output_file = open(output_filename, "wb")

    output_file.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
    output_file.write(b"<licenses>\n")

    # Sort the filenames so the output is stable
    sorted_filenames = sorted(id_table.keys())

    # Print out the index of notice files
    for filename in sorted_filenames:
        stripped_filename = SRC_DIR_STRIP_RE.sub(r"\1", filename)
        output_file.write(('<file-name contentId="%s">%s</file-name>\n' % (id_table.get(filename), stripped_filename)).encode())
    output_file.write(b"\n\n")

    processed_file_keys = []
    # Output the individual notice file contents
    for filename in sorted_filenames:
        file_key = id_table.get(filename)
        if file_key in processed_file_keys:
            continue
        processed_file_keys.append(file_key)

        output_file.write(('<file-content contentId="%s"><![CDATA[' % file_key).encode())
        with open(filename, "rb") as notice_file:
            output_file.write(html_escape(notice_file.read()))
        output_file.write(b"]]></file-content>\n\n")

    # Finish off the file output
    output_file.write(b"</licenses>\n")
    output_file.close()
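
# For reference, the generated XML contains one <file-name> entry per notice
# file and one <file-content> block per distinct notice text, both keyed by
# the MD5 of that text. Illustrative (made-up) fragment:
#
#   <file-name contentId="3858f62230ac3c915f300c664312c63f">/system/lib/libfoo.txt</file-name>
#   <file-content contentId="3858f62230ac3c915f300c664312c63f"><![CDATA[...]]></file-content>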

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--text-output', required=True,
        help='The text output file path.')
    parser.add_argument(
        '--html-output',
        help='The html output file path.')
    parser.add_argument(
        '--xml-output',
        help='The xml output file path.')
    parser.add_argument(
        '-t', '--title', required=True,
        help='The file title.')
    parser.add_argument(
        '-s', '--source-dir', required=True, action='append',
        help='The directory containing notices.')
    parser.add_argument(
        '-i', '--included-subdirs', action='append',
        help='The subdirectories which should be included.')
    parser.add_argument(
        '-e', '--excluded-subdirs', action='append',
        help='The subdirectories which should be excluded.')
    return parser.parse_args()

def main(argv):
    args = get_args()

    txt_output_file = args.text_output
    html_output_file = args.html_output
    xml_output_file = args.xml_output
    file_title = args.title
    included_subdirs = []
    excluded_subdirs = []
    if args.included_subdirs is not None:
        included_subdirs = args.included_subdirs
    if args.excluded_subdirs is not None:
        excluded_subdirs = args.excluded_subdirs

    input_dirs = [os.path.normpath(source_dir) for source_dir in args.source_dir]
    # Find all the notice files and md5 them
    files_with_same_hash = defaultdict(list)
    for input_dir in input_dirs:
        for root, dir, files in os.walk(input_dir):
            for file in files:
                matched = True
                if len(included_subdirs) > 0:
                    matched = False
                    for subdir in included_subdirs:
                        if (root == (input_dir + '/' + subdir) or
                            root.startswith(input_dir + '/' + subdir + '/')):
                            matched = True
                            break
                elif len(excluded_subdirs) > 0:
                    for subdir in excluded_subdirs:
                        if (root == (input_dir + '/' + subdir) or
                            root.startswith(input_dir + '/' + subdir + '/')):
                            matched = False
                            break
                if matched and file.endswith(".txt"):
                    filename = os.path.join(root, file)
                    file_md5sum = md5sum(filename)
                    files_with_same_hash[file_md5sum].append(filename)

    filesets = [sorted(files_with_same_hash[md5]) for md5 in sorted(list(files_with_same_hash))]
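    # Illustrative shape of the grouping (hash and paths are made up):
    #   files_with_same_hash == {"3858f6...": ["<dir>/bin/foo.txt", "<dir>/lib/bar.txt"]}
    #   filesets             == [["<dir>/bin/foo.txt", "<dir>/lib/bar.txt"]]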
    combine_notice_files_text(filesets, input_dirs, txt_output_file, file_title)

    if html_output_file is not None:
        combine_notice_files_html(filesets, input_dirs, html_output_file)

    if xml_output_file is not None:
        combine_notice_files_xml(files_with_same_hash, input_dirs, xml_output_file)

if __name__ == "__main__":
    main(sys.argv)