1#!/usr/bin/python3
2#
3# Copyright 2022-2023 The Khronos Group Inc.
4# SPDX-License-Identifier: Apache-2.0
5
6"""Used to convert files from the asciidoctor spec tree to Antora module
7format. Success is highly dependent on strict adherence to Vulkan spec
8authoring conventions.
9
10Usage: `antora-prep.py [-root path] -component path files`
11
12- `-root` is the root path (repository root, usually) relative to which spec
13  files are processed. Defaults to current directory if not specified.
14- `-component` is the path to the module and component in which converted
15  files are written (e.g. the component directory under which pages/,
16  partials/, images/, etc. are located).
17- `files` are asciidoc source files from the spec to convert.
18
19Image files are linked from the component 'images' directory
20
21Asciidoc markup files (.adoc) are scanned for the first title markup and
22classified as partials or pages depending on whether it is a top-level title
23or not. All .adoc files are rewritten to the component 'partials' directory, to
24allow transclusion of pages to work (otherwise the transclusions would also
25have to be rewritten).
26
27pages then have additional markup injected immediately following the page
28title to set custom attributes needed for the build. pages are then
29symbolically linked from the component 'pages' directory to the actual
30rewritten file in the 'partials' directory to follow Antora conventions.
31"""
32
33# For error and file-loading interfaces only
34import argparse
35import importlib
36import os
37import re
38import sys
39from generator import enquote
40from reflib import loadFile, logDiag, logWarn, logErr, setLogFile, getBranch
41from pathlib import Path
42
# Anchor line of the form [[anchor]] or [[anchor,text]]. Only the text
# before the first comma is captured as the anchor name.
titleAnchorPat = re.compile(r'^\[\[(?P<anchor>[^,]+).*\]\]$')
# Top-level title: exactly one '=' or '#', one space, then a title starting
# with an upper-case letter.
titlePat = re.compile(r'^[=#] (?P<title>[A-Z].*)')
# Lower-level title: two or more '=' or '#', one space, then a title
# starting with an upper-case letter.
subtitlePat = re.compile(r'^[=#]{2,} (?P<title>[A-Z].*)')

# File categories. The values double as the names of the corresponding
# subdirectories of the component directory.
(Pages, Partials, Images) = ('pages', 'partials', 'images')
50
def undefquote(s):
    """Return the JavaScript literal for s: the string 'undefined' when s
       is None, otherwise whatever enquote(s) produces."""

    return 'undefined' if s is None else enquote(s)
59
60
def mapAnchor(anchor, title, pageMap, xrefMap, closeAnchor):
    """Rewrite a <<anchor{, title}>> xref -> xref:pagemap#anchor[{title}]
        - anchor - anchor name
        - title - xref description or '' if not specified, in which case the
          anchor text from the xrefMap is used if available
        - closeAnchor - True if closing >> is on this line, False otherwise
        - pageMap, xrefMap - per rewriteXrefs below

       Returns the replacement text. When closeAnchor is False the
       returned xref is left open (no trailing ']'), since the rest of the
       title and the closing >> are on the following line.

       If the anchor is not in xrefMap, or its page anchor is not in
       pageMap, a message is printed to stderr and the bare anchor is used
       as the xref target.
    """

    # Determine which page anchor this anchor comes from
    # If it cannot be determined, use the unmapped anchor
    #@ Simplify the page anchor if pageName == current page
    try:
        (pageAnchor, mappedTitle) = xrefMap[anchor]

        if title == '' and closeAnchor:
            # No explicit title, and none can follow on the next line.
            # Infer one from anchor and xrefMap.
            title = mappedTitle

            # If the title is *still* empty, make a note of it and just use
            # the anchor name
            if title == '':
                print(f'No title found for anchor {anchor}', file=sys.stderr)
                title = anchor

        # Page the page anchor comes from
        pageName = pageMap[pageAnchor]
        print(f'mapAnchor: anchor {anchor} pageAnchor {pageAnchor} -> pageName = {pageName}')

        xref = f'{pageName}#{anchor}'
    except (KeyError, TypeError, ValueError):
        # Only failed lookups (KeyError) or malformed map entries
        # (TypeError / ValueError while unpacking) are treated as "unknown
        # anchor". Anything else, including KeyboardInterrupt, propagates.
        print(f'Cannot determine which page {anchor} comes from, passing through to Antora intact', file=sys.stderr)
        xref = f'{anchor}'

    # Remove extraneous whitespace
    title = ' '.join(title.split())

    if closeAnchor:
        return f'xref:{xref}[{title}]'
    else:
        return f'xref:{xref}[{title}'
108
def replaceAnchorText(match, pageMap, xrefMap):
    """Rewrite a complete <<anchor,text>> xref to xref:newanchor[text]
        - match - match object; group 1 is the anchor, group 2 is the text
        - pageMap, xrefMap - per rewriteXrefs below
    """

    (anchor, text) = match.group(1, 2)
    return mapAnchor(anchor, text, pageMap, xrefMap, closeAnchor=True)
119
def replaceAnchorOnly(match, pageMap, xrefMap):
    """Rewrite a complete <<anchor>> xref (no text) to xref:newanchor[]
        - match - match object; group 1 is the anchor
        - pageMap, xrefMap - per rewriteXrefs below

       An empty title is passed on, so mapAnchor decides what text ends
       up between the brackets.
    """

    return mapAnchor(match.group(1), '', pageMap, xrefMap, closeAnchor=True)
129
def replaceAnchorTrailingText(match, pageMap, xrefMap):
    """Rewrite an unterminated <<anchor, at end of line to xref:newanchor[
        - match - match object; group 1 is the anchor, group 2 is the text
          following the comma (may be empty)
        - pageMap, xrefMap - per rewriteXrefs below

       The xref is left open; the closing bracket is supplied when the
       >> on the following line is processed.
    """

    (anchor, text) = match.group(1, 2)
    return mapAnchor(anchor, text, pageMap, xrefMap, closeAnchor=False)
140
class DocFile:
    """Information about a markup file being converted.

       A DocFile is created empty, filled in by populate(), has its xrefs
       rewritten in memory by rewriteXrefs(), and is written to the
       component directory by rewriteFile()."""

    # Patterns used by rewriteXrefs, compiled once for all instances.
    # Complete xref with alt-text on a single line: <<anchor, text>>
    _xrefTextPat = re.compile(r'<<([^,>]*),([^>]+)>>')
    # Complete xref without alt-text: <<anchor>>
    _xrefOnlyPat = re.compile(r'<<([^,>]*)>>')
    # Xref still open at end of line: <<anchor, text...
    _xrefOpenPat = re.compile(r'<<([^,>]*),([^>]*)$')

    def __init__(self):
        """Constructor
           - lines - text of file as list of strings
           - root - common base directory for src files
           - component - path to component directory for outputs
           - srcpath - absolute path to file source
           - relpath - path to file source relative to root
           - dstpath - path to output file destination
           - dstlink - path to an alias (symlink to) dstpath, used for
             files that need to be in both partials and pages directories.
           - category - file type - Pages, Partials, or Images. These are
             string variables containing the corresponding component
             subdirectory name.
           - title - page title for Pages, else ''
           - titleLine - index into lines of the title line
           - titleAnchor - page title anchor for Pages, else ''
           - anchors - asciidoc anchors found in the file
           - includes - asciidoc includes found in the file
           - pageMap - dictionary mapping a page anchor to a source file
             relpath
           - xrefMap - dictionary mapping an anchor within a page to a page
             anchor
        """

        self.lines = None
        self.root = None
        self.component = None
        self.srcpath = None
        self.relpath = None
        self.dstpath = None
        self.dstlink = None
        self.category = None
        self.title = ''
        self.titleLine = 0
        self.titleAnchor = ''
        self.anchors = set()
        self.includes = set()

        self.pageMap = {}
        self.xrefMap = {}

    def findTitle(self):
        """Find category (Pages or Partials) and title in a .adoc markup
           file.

           Heuristic is to search the beginning of the file for a top-level
           asciidoc title, preceded by an anchor for the page. The most
           recent anchor line seen before the title is the one used.

           Returns (category, title, titleLine, titleAnchor):
           - category - Pages if the first title found is a top-level
             title, otherwise Partials
           - title - text of the first title found, or 'NO TITLE FOUND'
           - titleLine - index of the title in self.lines, or -1 if no
             title was found
           - titleAnchor - anchor preceding the title, or '' if none was
             found"""

        # Chapter title block must be within this many lines of start of file
        maxLines = 30

        # Default, if page anchor not found
        titleAnchor = ''

        for lineno, line in enumerate(self.lines[:maxLines]):
            # Look for the first anchor, which must precede the title to
            # apply to it (really, must precede it by exactly one line).
            match = titleAnchorPat.match(line)
            if match is not None:
                titleAnchor = match.group('anchor')
                continue

            # If we find a top-level title, it is a page.
            match = titlePat.match(line)
            if match is not None:
                return (Pages, match.group('title'), lineno, titleAnchor)

            # If we find a second-level or above title, it is a partial
            match = subtitlePat.match(line)
            if match is not None:
                return (Partials, match.group('title'), lineno, titleAnchor)

        # If we do not find a match in the first maxLines lines, assume it
        # is a partial.
        return (Partials, 'NO TITLE FOUND', -1, titleAnchor)

    def populate(self,
                 filename,
                 root,
                 component):
        """Populate data structures given file content and location.

           - filename - file to scan
           - root - absolute path to root under which all source files are
             read
           - component - absolute path to module / component directory under
             which all destination files are written

           Raises RuntimeError if loadFile returns no content for the file.
        """

        # Load file content
        self.srcpath = os.path.abspath(filename)
        self.lines, _ = loadFile(self.srcpath)
        if self.lines is None:
            raise RuntimeError(f'No such file {self.srcpath}')

        # Miscellaneous relevant paths
        self.root = root
        self.relpath = os.path.relpath(self.srcpath, root)
        self.component = component

        # Determine file category.
        # Only .adoc files are candidates for pages, which is verified by
        # looking at the file header for a top-level title.
        # .svg .jpg .png are always images
        # Anything else is a partial
        (_, fileext) = os.path.splitext(filename)

        # Defaults
        self.title = ''
        self.titleLine = 0
        self.titleAnchor = None
        self.dstlink = None

        if fileext in ('.svg', '.jpg', '.png'):
            self.category = Images
        elif fileext == '.adoc':
            (self.category,
             self.title,
             self.titleLine,
             self.titleAnchor) = self.findTitle()
        else:
            self.category = Partials

        # Determine destination path based on category
        # images/ are treated specially since there is only a single
        # directory and the component directory is already named Images.
        componentPath = Path(self.component)
        if self.category in (Partials, Pages):
            # Pages are also saved in partials/, and linked from pages/
            self.dstpath = componentPath / Partials / self.relpath
            if self.category == Pages:
                self.dstlink = componentPath / Pages / self.relpath
        else:
            # Images go under images/, not under images/images/
            # This could fail if there were ever top-level images but as all
            # images used in the spec are required to be specified relative
            # to {images}, it is OK.
            self.dstpath = componentPath / self.relpath

    def rewriteXrefs(self, pageMap = None, xrefMap = None):
        """Rewrite asciidoc <<>> xrefs into Antora xref: xrefs, including
           altering the xref target. self.lines is modified in place.

           - pageMap - map from page anchors to page names
           - xrefMap - map from anchors within a page to the page anchor

           Either map defaults to a new empty dictionary if not given."""

        # Avoid mutable default arguments, since the maps are stored on the
        # instance below.
        if pageMap is None:
            pageMap = {}
        if xrefMap is None:
            xrefMap = {}

        self.pageMap = pageMap
        self.xrefMap = xrefMap

        # Xref markup may be broken across lines, and may or may not include
        # anchor text. Track whether the closing >> is being looked for at
        # start of line, or not.
        withinXref = False

        for lineno, line in enumerate(self.lines):
            if withinXref:
                # Could use line.replace, but that does not return a match
                # count, so we cannot tell if the '>>' is missing.
                (line, count) = re.subn(r'>>', r']', line, count=1)
                if count == 0:
                    # Report a 1-based line number, as an editor would show
                    print(f'WARNING: No closing >> found on line {lineno + 1} of {self.relpath}', file=sys.stderr)
                elif line[0] != ' ' and self.lines[lineno-1][-1] not in '[ ':
                    # Add whitespace corresponding to crushed-out newline on
                    # previous line, so title words do not run together.
                    self.lines[lineno-1] += ' '
                withinXref = False

            # Now look for all xrefs starting on this line and remap them,
            # including remapping the anchor.

            # First, complete xrefs with alt-text (<<anchor, text>>)
            line = self._xrefTextPat.sub(
                lambda match: replaceAnchorText(match, pageMap, xrefMap),
                line)

            # Next, complete xrefs without alt-text (<<anchor>>)
            line = self._xrefOnlyPat.sub(
                lambda match: replaceAnchorOnly(match, pageMap, xrefMap),
                line)

            # Finally, if there is a trailing '<<anchor,' at EOL, remap it
            # and set the flag so the terminating '>>' on the next line will
            # be mapped into an xref closing ']'.
            (line, count) = self._xrefOpenPat.subn(
                lambda match: replaceAnchorTrailingText(match, pageMap, xrefMap),
                line)
            if count > 0:
                withinXref = True

            self.lines[lineno] = line

    def __str__(self):
        """Return a multi-line summary of the file, for debugging."""

        # lines is None until populate() has been called
        numLines = len(self.lines) if self.lines is not None else 0
        summary = [
            f'Input file {self.srcpath}: {numLines} lines',
            f'root = {self.root} component = {self.component} relpath = {self.relpath}',
            f'category = {self.category} dstpath = {self.dstpath}',
            f'title = {self.title}',
            f'titleAnchor = {self.titleAnchor}',
        ]
        return '\n'.join(summary)

    def removeDestination(self, path, text, overwrite):
        """Remove a destination file, if it exists and overwrite is true.
           Ensure the destination directory exists.

            path - file pathname
            text - descriptive text for errors
            overwrite - if True, replace existing output file

           Raises RuntimeError if path exists and overwrite is false.
        """

        # lexists, not exists, so a dangling symlink left over from an
        # earlier run is also detected and removed.
        if os.path.lexists(path):
            if not overwrite:
                raise RuntimeError(f'Will not overwrite {text}: {path}')
            # print(f'Removing {text}: {path}')
            os.remove(path)

        # A path with no directory component needs no directory created
        dirname = os.path.dirname(path)
        if dirname:
            # print(f'Creating {text} directory {dirname}')
            os.makedirs(dirname, exist_ok=True)

    def rewriteFile(self, overwrite = True, pageHeaders = None):
        """Write source file to component directory. Images are just symlinked
           to the external file. Pages are rewritten to Partials, then
           symlinked to Pages.

           - overwrite - if True, replace existing output files
           - pageHeaders - if not None, a list of strings to inject
             following the chapter heading in each page

           <<>>-style xrefs are assumed to be rewritten prior to calling
           rewriteFile.

           May still need to rewrite custom macros.

           Raises RuntimeError if an output exists and overwrite is false,
           or if a Page has no dstlink to link it from.
        """

        self.removeDestination(self.dstpath, 'destination file', overwrite)

        if self.category == Images:
            # Just symlink destination image to source
            # print(f'Symlinking {self.dstpath} -> {self.srcpath}')
            os.symlink(self.srcpath, self.dstpath)
        elif self.category == Partials:
            self.writeFile(self.dstpath)
        elif self.category == Pages:
            if pageHeaders is not None:
                # Inject page headers immediately following page title.
                # Add blank lines before and after the pageHeaders to avoid
                # coalescing with file content.
                split = self.titleLine + 1
                self.lines = (self.lines[:split]
                              + ['\n'] + list(pageHeaders) + ['\n']
                              + self.lines[split:])

            self.writeFile(self.dstpath)

            if self.dstlink is None:
                raise RuntimeError(f'Wrote Page {self.dstpath} to Partials, but no Pages link supplied')

            self.removeDestination(self.dstlink, 'destination link', overwrite)
            os.symlink(self.dstpath, self.dstlink)

    def writeFile(self, path):
        """Write self.lines[] to file at specified path.

           Lines are written as-is, with no line terminators added.
           Raises RuntimeError if the file cannot be opened."""

        try:
            fp = open(path, 'w', encoding='utf8')
        except OSError as err:
            raise RuntimeError(f'Cannot open output file {path}') from err

        # The context manager closes the file even if a write fails
        with fp:
            fp.writelines(self.lines)
428
def testHarness():
    """Run DocFile.rewriteXrefs over a small in-memory sample and print
       the lines before and after rewriting.

       The sample has xrefs with and without alt-text, including two
       that are broken across lines."""

    def printFile(label, lines):
        """Print a label, a separator, then each line on its own row."""
        print(label)
        print('------------------')
        for text in lines:
            print(text)

    # Maps from page anchors to files, and from anchors to
    # [ page anchor, title ]
    samplePages = {
        'ext'  : 'file/ext.adoc',
        'core' : 'file/core.adoc',
    }
    sampleXrefs = {
        'ext'       : [ 'ext', '' ],
        'ext-label' : [ 'ext', 'LABELLED ext-label' ],
        'core'      : [ 'core', 'Core Title' ],
        'core-label': [ 'core', 'Core Label Title' ],
    }

    sample = DocFile()
    sample.lines = [
        '<<ext,ext chapter>> <<ext-label,',
        'ext chapter/label>>',
        '<<core>>, <<core-label, core chapter/label',
        '>>'
    ]

    printFile('Original File', sample.lines)
    sample.rewriteXrefs(samplePages, sampleXrefs)
    printFile('Edited File', sample.lines)
461
if __name__ == '__main__':
    # Script entry point: parse arguments, classify every input file,
    # rewrite xrefs, write the converted files into the component
    # directory, and optionally write the page map as a .cjs file.
    parser = argparse.ArgumentParser()

    parser.add_argument('-root', action='store', dest='root',
                        default=os.getcwd(),
                        help='Specify root directory under which files are located (default current directory)')
    parser.add_argument('-pageHeaders', action='store', dest='pageHeaders',
                        default=None,
                        help='Specify file whose contents are injected after title of each converted page')
    parser.add_argument('-component', action='store', dest='component',
                        required=True,
                        help='Specify module / component directory in which converted files are written')
    #parser.add_argument('-htmlspec', action='store', dest='htmlspec',
    #                    default=None, required=False,
    #                    help='Specify HTML of generated spec to extract anchor mapping from')
    parser.add_argument('-xrefpath', action='store', dest='xrefpath',
                        default=None, required=False,
                        help='Specify path to xrefMap.py containing map of anchors to chapter anchors')
    parser.add_argument('-pagemappath', action='store', dest='pagemappath',
                        default=None, required=False,
                        help='Specify path to output pageMap.cjs containing map of anchors to chapter anchors')
    parser.add_argument('-filelist', action='store',
                        default=None, required=False,
                        help='Specify file containing a list of filenames to convert, one/line')
    parser.add_argument('files', metavar='filename', nargs='*',
                        help='Specify name of a single file to convert')

    args = parser.parse_args()

    args.root = os.path.abspath(args.root)
    args.component = os.path.abspath(args.component)

    # Replace the pageHeaders filename with the lines of that file
    if args.pageHeaders is not None:
        pageHeadersPath = args.pageHeaders
        args.pageHeaders, _ = loadFile(pageHeadersPath)
        if args.pageHeaders is None:
            # Same treatment as an unreadable filelist below, rather than
            # silently converting pages without their headers.
            raise RuntimeError(f'Error reading pageHeaders file {pageHeadersPath}')

    # Change to True to run the built-in xref rewriting test instead
    if False:
        testHarness()
        sys.exit(0)

    # Initialize dictionaries
    pageInfo = {}
    pageMap = {}

    # The xrefmap is imported from the 'xrefMap' module, if it exists
    if args.xrefpath is not None:
        sys.path.append(args.xrefpath)
    try:
        from xrefMap import xrefMap
    except ImportError:
        # Only a missing module / missing name falls back to an empty
        # map; other errors in xrefMap.py are allowed to surface.
        print('WARNING: No module xrefMap containing xrefMap dictionary', file=sys.stderr)
        xrefMap = {}

    # If a file containing a list of files was specified, add each one.
    # Could try using os.walk() instead, but that is very slow.
    if args.filelist is not None:
        count = 0
        lines, _ = loadFile(args.filelist)
        if lines is None:
            raise RuntimeError(f'Error reading filelist {args.filelist}')
        for line in lines:
            path = line.rstrip()
            # Only paths starting with a letter and ending in .adoc are
            # accepted. Slicing instead of indexing keeps blank lines from
            # raising IndexError.
            if path[:1].isalpha() and path.endswith('.adoc'):
                args.files.append(path)
                count += 1
        print(f'Read {count} paths from {args.filelist}')

    for filename in args.files:
        # Create data structure representing the file.
        docFile = DocFile()
        docFile.populate(filename = filename,
                         root = args.root,
                         component = args.component)
        # print(docFile, '\n')

        # Save information about the file under its relpath
        pageInfo[docFile.relpath] = docFile

        # Save mapping from page anchor to its relpath
        # NOTE(review): .adoc files with no title anchor have
        # titleAnchor == '', so they are all recorded under the '' key,
        # each replacing the previous one - confirm whether this is
        # intended.
        if docFile.titleAnchor is not None:
            pageMap[docFile.titleAnchor] = docFile.relpath

    # All files have been read and classified.
    # Rewrite them in memory.

    for docFile in pageInfo.values():
        # Look for <<>>-style anchors and rewrite them to Antora xref-style
        # anchors using the pageMap (of top-level anchors to page names) and
        # xrefmap (of anchors to top-level anchors).

        ## print(f'*** Rewriting {docFile.relpath}')
        ## print(docFile, '\n')

        docFile.rewriteXrefs(pageMap, xrefMap)
        docFile.rewriteFile(overwrite = True, pageHeaders = args.pageHeaders)

    # Write the pageMap to a .cjs file for use in the Antora build's
    # specmacros extensions. The xrefMap is already written in JS form.
    if args.pagemappath is not None:
        try:
            fp = open(args.pagemappath, 'w', encoding='utf8')
        except OSError as err:
            raise RuntimeError(f'Cannot open output pageMap.cjs file {args.pagemappath}') from err

        # The context manager closes the file even if a write fails
        with fp:
            print('exports.pageMap = {', file=fp)
            for pageAnchor in sorted(pageMap):
                pageName = pageMap[pageAnchor]
                print(f'    {undefquote(pageAnchor)} : {undefquote(pageName)},', file=fp)
            print('}', file=fp)
573
574##        if not os.path.exists(args.xrefmap):
575##            raise UserWarning(f'Specified xrefmap {args.xrefmap} does not exist')
576##        if args.xrefmap[-3:] != '.py':
577##            raise UserWarning(f'Specified xrefmap {args.xrefmap} is not a .py file')
578##
579##        abspath = os.path.abspath(args.xrefmap)
580##        xrefdir = os.path.dirname(os.path.abspath(args.xrefmap))
581##        sys.path.append(dir)
582##
583##        xrefbase = os.path.split(args.xrefmap)[1]
584##        xrefbase = os.path.splitext(xrefbase)[0]
585##
586##            raise UserWarning(f'Specified xrefmap {args.xrefmap} does not exist')
587