#!/usr/bin/python3
#
# Copyright 2020-2023 The Khronos Group Inc.
# SPDX-License-Identifier: Apache-2.0

# map_html_anchors - map each id= element in a spec HTML file onto the
# top-level (chapter) id= element it belongs to. Used to rewrite spec
# xrefs for Antora. Writes a Python and/or JavaScript script containing a
# dictionary mapping each discovered ID into the top-level ID it belongs
# to, and the corresponding title element following the id.
#
# This script is very specific to HTML generated by asciidoctor and
# following conventions of the Vulkan style guide.

# Usage: map_html_anchors.py [-pyfile FILE] [-jsfile FILE] file.html

import argparse
import re
import sys

try:
    from lxml import etree
except ImportError:
    # lxml is only needed by main() to parse the HTML file. Deferring the
    # failure keeps add_id() / generate_map() importable without it.
    etree = None


# Heading tags whose own text is the title of the ID they carry
_HEADING_TAGS = frozenset(('h2', 'h3', 'h4', 'h5', 'h6'))

# Tags whose id elements are anchors (we are not concerned about other
# tags such as <svg>).
_ID_TAGS = _HEADING_TAGS | {'a', 'div', 'table'}

# Tags whose id elements we do not care about; they are skipped silently
# instead of being reported as unknown.
_REJECTED_TAGS = frozenset((
    'svg',
    'circle',
    'clippath',
    'defs',
    'ellipse',
    'g',
    'grid',
    'lineargradient',
    'marker',
    'metadata',
    'namedview',
    'path',
    'path-effect',
    'rect',
    'stop',
    'text',
    'tspan',
))


def contains_any_of(words, wordlist):
    """Returns True if any element of 'words' is contained in 'wordlist'

    - words - iterable of words to check against wordlist
    - wordlist - iterable of words"""

    return any(word in wordlist for word in words)


# Matches a leading section number ('1.2. ') or table number ('Table 3. ')
# at the start of a title, including any spaces following it.
sectNumberPat = re.compile(r'^(Table |)([0-9]+\.)+ *')


def _child_text(elem, path, default=''):
    """Return the concatenated text of the first descendant of elem
    matching path, or default if there is no such descendant."""

    child = elem.find(path)
    if child is None:
        return default
    return ''.join(child.itertext())


def _div_title(idelem, anchor_id):
    """Return the (unnormalized) title for a <div id=> element, chosen
    according to the div's class attribute."""

    classes = idelem.get('class')
    if classes is None:
        # <div id=> without a class may have a corresponding <h# id=> with
        # the same id - in this case, the div entry is replaced when the
        # following element is encountered.
        return ''

    divclass = classes.split()

    if contains_any_of(('admonitionblock', 'paragraph', 'sidebarblock'), divclass):
        # <div> classes with no title elements (paragraphs or NOTEs)
        return ''

    if 'listingblock' in divclass:
        # <div id= class="listingblock"> has title == id (used for API includes)
        return anchor_id

    if contains_any_of(('dlist', 'openblock'), divclass):
        # <div> classes with titles in the text of the first
        # <dt class="hdlist1"> element of the div
        #
        # "dlist" are mostly glossary elements
        # "openblock" are mostly SPIR-V keywords
        #
        # If no <dt> is found, this probably means a label on an API open
        # block, which has no title.
        return _child_text(idelem, './/dt[@class="hdlist1"]')

    if contains_any_of(('ulist', 'imageblock'), divclass):
        # <div> classes with titles in the first
        # <div class="title"> element of the div
        return _child_text(idelem, './/div[@class="title"]')

    print(f'Cannot find title for <div id="{anchor_id}" class="{classes}"> - unrecognized class', file=sys.stderr)
    return ''


def _find_title(idelem, anchor_id):
    """Return the (unnormalized) title corresponding to idelem, or '' if
    it has none."""

    tag = idelem.tag

    if tag in _HEADING_TAGS:
        # <h# id=> has ((#.)* *title) in the text of its element
        return ''.join(idelem.itertext())

    if tag == 'table':
        # <table id=> may be followed by <caption class="title">
        # with 'Table ##. caption' text
        return _child_text(idelem, './/caption[@class="title"]',
                           'NO TABLE CAPTION FOUND')

    if tag == 'div':
        return _div_title(idelem, anchor_id)

    # <a id=> does not have a corresponding title element. Any other tag
    # also gets an empty title instead of failing on an unbound variable.
    return ''


def add_id(chapelem, idelem, id_map, chapter_id):
    """Add a ID -> [ chapter ID, title ] mapping.

    - chapelem - Element for the chapter containing this ID (not used
      by the current implementation)
    - idelem - Element for the ID itself
    - id_map - dictionary containing the map; updated in place
    - chapter_id - chapter ID of chapelem

    If the ID is already present in id_map, a note is printed to stderr
    and the existing entry is replaced."""

    # The actual ID
    anchor_id = idelem.get('id')

    # Strip whitespace and leading table or section numbers, if present.
    # This is done before reporting a replacement so that the old and new
    # titles in that message are both shown in their stored form.
    id_title = sectNumberPat.sub('', _find_title(idelem, anchor_id).strip())

    if anchor_id in id_map:
        val = id_map[anchor_id]
        print(f'Replacing key {anchor_id} -> ({val[0]}, {val[1]}) with ({chapter_id}, {id_title})', file=sys.stderr)

    # Map the xref to the chapter it came from and its title
    id_map[anchor_id] = [chapter_id, id_title]


def _quote(text):
    """Return text as a single-quoted string literal that is valid in both
    Python and JavaScript source."""

    # Backslashes must be escaped first so that the escapes added for the
    # other characters are not themselves doubled.
    escaped = (str(text)
               .replace('\\', '\\\\')
               .replace("'", "\\'")
               .replace('\n', '\\n')
               .replace('\r', '\\r'))
    return f"'{escaped}'"


def generate_map(id_map, filename, scripttype):
    """Encode the ID map into the specified scripttype ('python' or
    'javascript') in the specified file.

    - id_map - dictionary mapping ID -> [ chapter ID, title ]
    - filename - path of the file to write (overwritten if it exists)
    - scripttype - 'javascript' selects a CommonJS-style
      'exports.xrefMap' assignment; any other value produces a plain
      Python 'xrefMap' assignment"""

    # Python and JS are extremely similar when the output is just a
    # dictionary of lists of strings.
    if scripttype == 'javascript':
        header = 'exports.xrefMap = {'
    else:
        header = 'xrefMap = {'

    # UTF-8 is the default source encoding for Python, so write the file
    # that way regardless of the current locale.
    with open(filename, 'w', encoding='utf-8') as fp:
        print(header, file=fp)

        # Sort keys so they can be compared between runs
        for key in sorted(id_map):
            entry = id_map[key]
            print(f" {_quote(key)} : [ {_quote(entry[0])}, {_quote(entry[1])} ],", file=fp)

        print('}', file=fp)


def _build_id_map(tree):
    """Return a dictionary mapping each ID (anchor) found in tree to
    [chapter ID, ID title], where 'chapter ID' is the ID of the chapter
    the anchor appears in.

    Raises UserWarning if a chapter does not contain exactly one
    <h2 id=> element."""

    id_map = {}

    # Find each <div class="sect1"> element, which corresponds to a
    # chapter.
    for chapelem in tree.findall('.//div[@class="sect1"]'):
        h2_elems = chapelem.findall('.//h2[@id]')
        if len(h2_elems) != 1:
            raise UserWarning(f'Error! <div> must have exactly 1 <h2> element, has {len(h2_elems)}')
        chapter_id = h2_elems[0].get('id')

        for idelem in chapelem.findall('.//*[@id]'):
            if idelem.tag in _ID_TAGS:
                add_id(chapelem, idelem, id_map, chapter_id)
            elif idelem.tag not in _REJECTED_TAGS:
                # Tags in _REJECTED_TAGS are known and silently ignored;
                # anything else is reported.
                print(f' Rejecting unknown tag with ID <{idelem.tag} id="{idelem.get("id")}"', file=sys.stderr)

    return id_map


def main():
    """Parse command-line arguments, map the IDs in the HTML spec file,
    and write the requested Python and/or JavaScript map files."""

    argparser = argparse.ArgumentParser()

    argparser.add_argument('-jsfile', action='store',
                           default=None,
                           help='Specify name of JavaScript file to generate')
    argparser.add_argument('-pyfile', action='store',
                           default=None,
                           help='Specify name of Python file to generate')
    argparser.add_argument('files', metavar='filename', nargs=1,
                           help='HTML spec file to map IDs from')
    args = argparser.parse_args()

    if etree is None:
        raise ImportError('lxml is required to parse the HTML spec file')

    # There is exactly one HTML filename
    filename = args.files[0]
    tree = etree.parse(filename, etree.HTMLParser())

    id_map = _build_id_map(tree)

    if args.pyfile is not None:
        generate_map(id_map, args.pyfile, 'python')
    if args.jsfile is not None:
        generate_map(id_map, args.jsfile, 'javascript')


if __name__ == '__main__':
    main()