1#!/usr/bin/python3
2#
3# Copyright 2020-2023 The Khronos Group Inc.
4# SPDX-License-Identifier: Apache-2.0
5
# map_html_anchors - map each id= element in a spec HTML file onto the
# top-level (chapter) id= element it belongs to. Used to rewrite spec
# xrefs for Antora. Writes a Python and/or JavaScript script containing a
# dictionary mapping each discovered ID to the top-level ID it belongs
# to and the title text associated with that ID (empty if none is found).
11#
12# This script is very specific to HTML generated by asciidoctor and
13# following conventions of the Vulkan style guide.
14
# Usage: map_html_anchors.py [-pyfile xrefMap.py] [-jsfile xrefMap.js] file.html
16
17import argparse
18import re
19import sys
20from lxml import etree
21
def contains_any_of(words, wordlist):
    """Returns True if any element of 'words' is contained in 'wordlist'

       - words - iterable of words to check against wordlist
       - wordlist - container of words; each check is 'word in wordlist'

       Returns False when 'words' is empty."""

    return any(word in wordlist for word in words)
32
# Matches a leading number prefix on a title: an optional 'Table '
# followed by one or more 'N.' groups and any trailing spaces,
# e.g. '3.2. ' or 'Table 12. '.
sectNumberPat = re.compile(r'^(Table |)([0-9]+\.)+ *')

# Heading tags whose own text content is the title for their ID
_HEADING_TAGS = ('h2', 'h3', 'h4', 'h5', 'h6')

def _element_text(elem):
    """Return all text inside elem, including text of child elements."""

    return ''.join(elem.itertext())

def _div_title(idelem, anchor_id):
    """Return the raw title for a <div id=> element, or '' if none.

       - idelem - the <div> Element
       - anchor_id - value of the div's id attribute"""

    classes = idelem.get('class')
    if classes is None:
        # <div id=> without a class may have a corresponding <h# id=> with
        # the same id - in this case, the div's entry is replaced when that
        # following element is added under the same key.
        return ''

    divclass = classes.split()

    if contains_any_of(('admonitionblock', 'paragraph', 'sidebarblock'), divclass):
        # <div> classes with no title elements (paragraphs or NOTEs)
        return ''

    if 'listingblock' in divclass:
        # <div id= class="listingblock"> has title == id (used for API includes)
        return anchor_id

    if contains_any_of(('dlist', 'openblock'), divclass):
        # <div> classes with titles in the text of the first
        # <dt class="hdlist1"> element of the div
        #
        # "dlist" are mostly glossary elements
        # "openblock" are mostly SPIR-V keywords
        dtelem = idelem.find('.//dt[@class="hdlist1"]')
        if dtelem is None:
            # No <dt> found, this probably means a label on an API open block
            return ''
        return _element_text(dtelem)

    if contains_any_of(('ulist', 'imageblock'), divclass):
        # <div> classes with titles in the first
        # <div class="title"> element of the div
        titleelem = idelem.find('.//div[@class="title"]')
        if titleelem is None:
            return ''
        return _element_text(titleelem)

    print(f'Cannot find title for <div id="{anchor_id}" class="{classes}"> - unrecognized class', file=sys.stderr)
    return ''

def add_id(chapelem, idelem, id_map, chapter_id):
    """Add a ID -> [ chapter ID, title] mapping.

       - chapelem - Element for the chapter containing this ID (unused here)
       - idelem - Element for the ID itself
       - id_map - dictionary containing the map; updated in place
       - chapter_id - chapter ID of chapelem

       The title is '' when none can be determined, including for tags
       other than <a>, <h2>-<h6>, <table> and <div>. If the ID is already
       in id_map, the old entry is replaced and a message is written to
       stderr."""

    # The actual ID
    anchor_id = idelem.get('id')
    tag = idelem.tag

    # Try to determine the title corresponding to this ID, or '' otherwise
    if tag in _HEADING_TAGS:
        # <h# id=> has ((#.)* *title) in the text of its element
        id_title = _element_text(idelem)
    elif tag == 'table':
        # <table id=> may contain <caption class="title">
        # with 'Table ##. caption' text
        capelem = idelem.find('.//caption[@class="title"]')
        if capelem is not None:
            id_title = _element_text(capelem)
        else:
            id_title = 'NO TABLE CAPTION FOUND'
    elif tag == 'div':
        id_title = _div_title(idelem, anchor_id)
    else:
        # <a id=> does not have a corresponding title element. Any other
        # tag also falls through to an empty title instead of leaving
        # id_title unassigned.
        id_title = ''

    # Strip whitespace and leading table or section numbers, if present.
    # Done before reporting a replacement so that the old and new titles
    # in the message are both in their stored form.
    id_title = sectNumberPat.sub('', id_title.strip())

    if anchor_id in id_map:
        val = id_map[anchor_id]
        print(f'Replacing key {anchor_id} -> ({val[0]}, {val[1]}) with ({chapter_id}, {id_title})', file=sys.stderr)

    # Map the xref to the chapter it came from and its title
    id_map[anchor_id] = [ chapter_id, id_title ]
113
def _script_string(value):
    """Return value as a single-quoted string literal that is valid in
       both Python and JavaScript source."""

    # Backslashes must be escaped first so that the escapes added
    # afterwards are not themselves doubled.
    escaped = (str(value)
               .replace('\\', '\\\\')
               .replace("'", "\\'")
               .replace('\n', '\\n')
               .replace('\r', '\\r'))
    return f"'{escaped}'"

def generate_map(id_map, filename, scripttype):
    """Encode the ID map into the specified scripttype ('python' or
       'javascript') in the specified file.

       - id_map - dictionary mapping ID -> [ chapter ID, title ]
       - filename - path of the file to write (overwritten if it exists)
       - scripttype - 'javascript' writes 'exports.xrefMap = {...}';
         any other value writes the Python form 'xrefMap = {...}'

       Backslashes, single quotes and newlines in IDs and titles are
       escaped so the generated file remains syntactically valid. The
       file is written as UTF-8."""

    # Python and JS are extremely similar when the output is just a
    # dictionary of lists of strings.
    if scripttype == 'javascript':
        opening = 'exports.xrefMap = {'
    else:
        opening = 'xrefMap = {'

    # 'with' guarantees the file is closed even if a write fails
    with open(filename, 'w', encoding='utf-8') as fp:
        print(opening, file=fp)

        # Sort keys so they can be compared between runs
        for key in sorted(id_map):
            entry = id_map[key]
            chapter = _script_string(entry[0])
            title = _script_string(entry[1])
            print(f"    {_script_string(key)} : [ {chapter}, {title} ],", file=fp)

        print('}', file=fp)
135
def main():
    """Parse the command line, map every anchor ID in the HTML spec file
       to its chapter, and write the requested map file(s).

       Raises UserWarning if a <div class="sect1"> chapter element does not
       contain exactly one <h2 id=> element."""

    argparser = argparse.ArgumentParser()

    argparser.add_argument('-jsfile', action='store',
                           default=None,
                           help='Specify name of JavaScript file to generate')
    argparser.add_argument('-pyfile', action='store',
                           default=None,
                           help='Specify name of Python file to generate')
    argparser.add_argument('files', metavar='filename', nargs=1,
                           help='HTML spec file to map IDs from')
    args = argparser.parse_args()

    # Tags whose id elements are anchors (we are not concerned about other
    # tags such as <svg>).
    idtags = frozenset(('a', 'div', 'h2', 'h3', 'h4', 'h5', 'h6', 'table'))

    # Tags whose id elements we do not care about; these are skipped
    # without a diagnostic.
    rejected_tags = frozenset((
        'svg',
        'circle',
        'clippath',
        'defs',
        'ellipse',
        'g',
        'grid',
        'lineargradient',
        'marker',
        'metadata',
        'namedview',
        'path',
        'path-effect',
        'rect',
        'stop',
        'text',
        'tspan',
    ))

    # There is exactly one HTML filename
    filename = args.files[0]
    tree = etree.parse(filename, etree.HTMLParser())

    # Dictionary mapping an ID (anchor) to [chapter ID, ID title],
    # where 'chapter ID' is the ID of the chapter it appears in
    id_map = {}

    # Find each <div class="sect1"> element, which corresponds to a
    # chapter.
    for chapelem in tree.findall('.//div[@class="sect1"]'):
        h2_elems = chapelem.findall('.//h2[@id]')
        if len(h2_elems) != 1:
            raise UserWarning(f'Error! <div> must have exactly 1 <h2> element, has {len(h2_elems)}')
        chapter_id = h2_elems[0].get('id')

        for idelem in chapelem.findall('.//*[@id]'):
            if idelem.tag in idtags:
                add_id(chapelem, idelem, id_map, chapter_id)
            elif idelem.tag not in rejected_tags:
                # Neither an anchor tag nor a tag known to be ignorable
                print(f'    Rejecting unknown tag with ID <{idelem.tag} id="{idelem.get("id")}"', file=sys.stderr)

    if args.pyfile is not None:
        generate_map(id_map, args.pyfile, 'python')
    if args.jsfile is not None:
        generate_map(id_map, args.jsfile, 'javascript')

if __name__ == '__main__':
    main()
211