#!/usr/bin/python # -*- coding: utf-8 -*- # # Builds OpenPOWER Foundation documentation using standard template. # # Assumes rst2db has been used to convert rst to docbook. # import os, sys, getopt, shutil, errno, subprocess, copy, re from os import fdopen, remove from shutil import move from git import Repo from lxml import etree from conf import opf_docbook_settings, master_doc, project from subprocess import Popen, PIPE def copy_xml_to_template(src_dir, tgt_dir): # Copy XML files src_files = os.listdir(src_dir) for filename in src_files: full_file = os.path.join (src_dir, filename) if (os.path.isfile(full_file)): shutil.copy(full_file, tgt_dir) elif (os.path.isdir(full_file)): try: os.makedirs(os.path.join(tgt_dir,filename)) except OSError as exception: if exception.errno != errno.EEXIST: raise copy_xml_to_template( os.path.join(src_dir,filename), os.path.join(tgt_dir,filename) ) def update_file(filename, old_str, new_str): # Verify tag exists with open(filename) as f: s = f.read() if old_str not in s: print 'Error: "{old_str}" not found in {filename}.'.format(**locals()) sys.exit(-2) # Safely write the changed content, if found in the file with open(filename, 'w') as f: s = s.replace(old_str, new_str) f.write(s) def traverse_clean_html_source_examples(filename): temp_file = filename + '.tmp' code_found = False html_source_start_regex = '^
'
    html_source_stop_regex = '^
' span_regex = '\' print filename # Walk file by line with open(temp_file, 'w') as new_file: with open(filename) as old_file: for line in old_file: if re.match(html_source_start_regex,line): # print 'DEBUG: Code block start found' code_found = True elif re.match(html_source_stop_regex,line): # print 'DEBUG: Code block stop found' code_found = False if code_found: oldline = line # Remove references line = line.replace('', '') # Remove references line = re.sub(span_regex, '', line) # print 'DEBUG: line changed.\n Old: >' + oldline + '<\n New: >' + line + '<' new_file.write(line) # Preserve old file move(filename, filename + '.bak') # Move new file into old move(temp_file, filename) def traverse_clean_html_nodes(element): if 'ul' in element.tag and element.attrib: key = element.attrib.keys()[0] value = element.attrib[key] if 'id' in key: first_child = element.__getitem__(0); if first_child.__len__() == 0: print 'Error: Bad assumption.
    tag is empty.' # Add attribute to first_child and remove from element first_child.attrib[ key ] = value; del element.attrib[ key ] # print 'DEBUG:
      attributes: ', element.attrib # print 'DEBUG: child attributes: ', first_child.attrib sys.stderr.write( '**Information: id attribute on
        tag to first sub-element, <' + element.tag + '> for ' + key + ' = ' + value + '\n' ) for child in element.getchildren(): traverse_clean_html_nodes(child) def cleanup_html(infile, outfile): # Create internal representation of document from infile parser = etree.XMLParser(remove_comments=False) tree = etree.parse(infile, parser=parser) head = tree.getroot() # print_tree( head, 0, 2 ) # Walk nodes doing any cleanup traverse_clean_html_nodes(head) # Persist updates to output file tree.write(outfile) # Note: This invocation needs to occur post tree-write because # it will update file traverse_clean_html_source_examples(outfile) def find_match(reference, anchor_node, relationship): if not anchor_node is None and 'anchor' in anchor_node.tag: # Try this, verify matching ids key = anchor_node.attrib.keys()[0] value = anchor_node.attrib[key] regex = '^' + reference + '(\.\d+)?$' # print 'DEBUG: ' + relationship + ' anchor check. Reference: ' + reference + ' Regex: ' + regex + ' Value: ' + value if re.match(regex,value): return anchor_node else: # print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Looking further...' node = anchor_node while not node.getprevious() is None: node = node.getprevious() if 'anchor' in node.tag: key = node.attrib.keys()[0] value = node.attrib[ key ] if re.match(regex,value): # print 'DEBUG: Anchor in ' + relationship + ' tag finally match!!!' return node # else # print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Looking further...' else: # print 'DEBUG: Anchor in ' + relationship + ' tag does not match. Expected: ', reference, ' Found: ', value, ' Anchor node: ', node return None else: # print 'Error: find_match called with non-anchor element. Reference: ' + reference + ' Node: ' + anchor_node + ' Relationship: ' + relationship return None def traverse_clean_links(element): if 'link' in element.tag: # Note: Terminal tag, no need to recurse # Gather link details text = element.text num_attributes = element.attrib.__len__() reference = element.attrib.get('linkend',None) if num_attributes is 1 and not reference is None and text == u'¶': # Erroneous link message, find related anchor, could be "uncle" or "cousin" (of various degrees) anchor = None parent = element.getparent() grandparent = parent.getparent() greatuncle = grandparent.getprevious() # Check Great Uncle for match anchor = find_match(reference, greatuncle, 'Great Uncle') # If no match, locate "cousin" and if found, check it if anchor is None: cousin = None if not greatuncle is None: node = greatuncle while node.__len__() > 0 and cousin is None: node = node.__getitem__(node.__len__() -1) if 'anchor' in node.tag: cousin = node if not cousin is None: anchor = find_match(reference, cousin, 'Cousin') # If no match, try uncle if anchor is None: uncle = parent.getprevious() anchor = find_match(reference, uncle, 'Uncle') # Always delete tag of this type (contains only u'¶' for text) parent.__delitem__(parent.index(element)) if not anchor is None: # print 'MATCH FOUND: ', reference # Retrieve attribute key from anchor # Note: The key is always correctly set by herold in the case of duplicate keys. # The tag may have a "dot" and a number appended to value in . key = anchor.attrib.keys()[0] value = anchor.get(key) if 'title' in parent.tag: # Add id attribute to Grandparent grandparent.set(key,value) else: # Add id attribute to Parent parent.set(key,value) sys.stderr.write( '**Information: removed dummy link and for ' + reference + ' and added proper xml:id as ' + value + '\n' ) # Delete tag anchor_parent = anchor.getparent() anchor_parent.__delitem__(anchor_parent.index(anchor)) else: # Nothing more to do sys.stderr.write( '**Information: Matching element not found for reference = ' + reference + '. Link removed.' + '\n' ) else: for child in element.getchildren(): traverse_clean_links(child) def traverse_clean_other(element): if 'informalexample' in element.tag: # Get key elements around this one parent = element.getparent() grandparent = parent.getparent() # Create new elements -- section and title (use text from informal example element) new_section = parent.makeelement(grandparent.tag) new_title = parent.makeelement('title') title = element.text new_title.text = title # Add title to new section new_section.append(new_title) # Copy over children from to new
        for child in element.getchildren(): element.remove(child) new_section.append(child) # print 'DEBUG: old tree...' # print_tree(parent, 0, 2) # Add new
        as next sibling of parent and remove from parent parent.addnext(new_section) parent.remove(element) # print 'DEBUG: new tree...' # print_tree(parent.getparent(), 0, 3) sys.stderr.write( '**Information: ' + element.text + ' removed and promoted as
        with title: ' + title + '\n' ) elif 'note' in element.tag: # Get key elements around this one parent = element.getparent() grandparent = parent.getparent() # print 'DEBUG: old tree...' # print_tree(parent, 0, 4) # Create new elements -- section and title (use text from bridgehead subelement) new_section = parent.makeelement(parent.tag) bridgehead = element.__getitem__(0).__getitem__(0) if not 'bridgehead' in bridgehead.tag: print 'Error: Bad assumption about structure. Bridgehead not found as expected.' sys.exit(-20) title = bridgehead.text new_title = parent.makeelement('title') new_title.text = title # Add title to new section new_section.append(new_title) # Remove from bridgehead.getparent().remove(bridgehead) # Copy over remaining items in to new
        for child in element.getchildren(): element.remove(child) new_section.append(child) # Add new
        as next sibling of parent and remove from parent parent.addnext(new_section) parent.remove(element) # print 'DEBUG: New tree...' # print_tree(grandparent, 0, 3) sys.stderr.write( '**Information: removed and promoted as
        with title: ' + title + '\n' ) elif 'anchor' in element.tag: # Get key elements around this one parent = element.getparent() # Retrieve anchor details key = element.attrib.keys()[0] value = element.attrib[ key ] # Remove node parent.remove( element ); sys.stderr.write( '**Information: removed with id: ' + value + '\n' ) elif 'section' in element.tag: #Ensure at least one child beyond if element.__len__() == 1: title = element.__getitem__(0).text parent = element.getparent() # Make and add empty paragraph to section, just behind title new_para = parent.makeelement('para') new_para.text = ' ' element.append(new_para) sys.stderr.write( '**Information: <para> tag added to empty section with title: ' + title + '\n' ) for child in element.getchildren(): traverse_clean_other(child) def cleanup_xml(infile, outfile): # Create internal representation of document from infile parser = etree.XMLParser(remove_comments=False) tree = etree.parse(infile, parser=parser) head = tree.getroot() # print_tree( head, 0, 2 ) # Note: because link cleanup involves relative location of multiple tags, it must be separate and first traverse_clean_links(head) traverse_clean_other(head) # Persist updates to output file tree.write(outfile) def print_tree(element, level, max_depth): # Print current element num_children = element.__len__() indent = ' '.ljust(level+1) if level < max_depth: print indent, 'Tag: ', element.tag, ' Attrib: ', element.attrib, ' Text: >', element.text, '< Num children: ', num_children for i in range(num_children): child = element.__getitem__(i) print_tree(child, level+1, max_depth) def traverse_clean_sections(element): section_blacklist = ['Navigation', 'Table Of Contents'] # Walk children looking for next set of <section> tags, opening include files if necessary num_children = element.__len__() i = 0; while i < num_children: child = element.__getitem__(i) parent = element # print 'DEBUG: clean sections, visiting node with tag: ' + child.tag # Walk first level of tags, deleting info and any "blacklist" sections if 'section' in child.tag: num_sec_children = child.__len__() title = '' if num_sec_children > 0: first_grandchild = child.__getitem__(0) if first_grandchild.__len__() == 0: title = child.__getitem__(0).text else: # This makes me nervous, not sure how well it will work... title = first_grandchild.__getitem__(0).text # print 'Section title found: ' + title if title in section_blacklist: # Delete section # print 'DEBUG: Deleted blacklist section ' + title parent.remove(child) num_children = num_children-1 else: traverse_clean_sections(child) i = i+1 else: i=i+1 def eliminate_top_section(head): # Remove <info> and <index> sections for child in head.getchildren(): if 'info' in child.tag or 'index' in child.tag: # print 'DEBUG: unneeded top level tag: ' + child.tag head.remove(child) # Eliminate head section which really is title if head.__len__() == 1: first_section = head.__getitem__(0) if not 'section' in first_section.tag: print 'Error: Bad assumption. Top tag in document is not a section.' sys.exit(-36) # print 'DEBUG: first section -- tag: ' + first_section.tag + ' num children: ' + str(first_section.__len__()) for child in first_section.getchildren(): # print 'DEBUG: child -- tag: ' + child.tag + ' num children: ' + str(child.__len__()) # Promote sections if 'section' in child.tag: first_section.remove(child); head.append(child); # print 'DEBUG: Promoting child -- tag: ' + child.tag head.remove(first_section) else: print 'Error: Bad assumption. Too many sections (' + str(head.__len__()) + ') found in base document.' sys.exit(-13) def transform_head_sections(head): num_chapter = 0 for child in head.getchildren(): if 'section' in child.tag: child.tag = child.tag.replace('section','chapter') num_chapter = num_chapter+1 if num_chapter == 0: print 'Error: No chapters found in document' sys.exit(-6) def convert_structure(infile, outfile): # Create internal representation of document from infile parser = etree.XMLParser(remove_comments=False) tree = etree.parse(infile, parser=parser) head = tree.getroot() # print 'DEBUG: Pre tree structure cleanup...' # print_tree(head, 0, 3) if 'article' in head.tag: head.tag = 'book' # Clear attributes for attrib in head.attrib.keys(): head.attrib.pop(attrib, None) if head.attrib.items() != []: print 'Error: Section attributes not removed. ', head.attrib.items(), ' items remain -- ', head.attrib.keys() sys.exit(-5) else: print 'Toc file contains ', head.tag, 'tag, not <article>' sys.exit(-4) # Traverse tree sections, removing nodes as needed traverse_clean_sections(head) # Eliminate first section, placeholder for document title eliminate_top_section(head) # Traverse remaining top level <section> and convert to <chapter> transform_head_sections(head) # print 'DEBUG: Post tree structure cleanup...' # print_tree(head, 0, 2) # Persist updates to output file tree.write(outfile) def remove_book_tags(old_file, new_file): with open(old_file, 'r') as input: with open(new_file, 'wb') as output: for line in input: if '<book' not in line and '</book>' not in line: output.write(line) def insert_toc_into_book(toc_file, book_file): book_file_bak = book_file+'.bak' shutil.copy2(book_file, book_file_bak) key_string = '<!--TBD-->' inserted_toc = False with open(book_file_bak, 'r') as input: with open(book_file, 'wb') as output: for line in input: if key_string not in line: output.write(line) else: inserted_toc = True # Write toc_file contents with open(toc_file, 'r') as input_toc: for line_toc in input_toc: output.write(line_toc) if not inserted_toc: print 'Error: key string of "', key_string, '" not found in ', book_file sys.exit(-7) def build_revhistory(book_file): # Variables for formating git log log_format = '%h%x01%an%x01%ad%x01%s%x02' log_fields = ['id', 'author', 'date', 'subject'] # Retrieve log pipe = Popen('git log --date=iso --format="%s" -- . .' % log_format, shell=True, stdout=PIPE) log, _ = pipe.communicate() # Substitute for problem characters: &, <, > log = log.replace('&','&').replace('<','<').replace('>','>') # Remove newlines, trailing end-of-record (0x02), and then split at end-of-record log = log.replace('\n','').strip('\x02').split('\x02') # Split records into individual fields log = [row.split('\01') for row in log] # Create dictionary using field names log = [dict(zip(log_fields, row)) for row in log] # Format log into revision history revision = '<revhistory>\n' for entry in log: revision = revision + '<revision><date>' + entry['date'].split(' ')[0] + '</date><revdescription><para>' +\ entry['subject'] + ' (' + entry['id'] + ')</para></revdescription></revision>\n' revision = revision + '</revhistory>\n' # Update file rev_str = '<revhistory>TBD</revhistory>' update_file(book_file, rev_str, revision) def main(argv): master_git_url = 'https://github.com/OpenPOWERFoundation/Docs-Master.git' template_git_url = 'https://github.com/OpenPOWERFoundation/Docs-Template.git' html_dir = '' build_dir = '' db_dir = '' master_dir = '' template_dir = '' toc_file = master_doc+'.xml' try: opts, args = getopt.getopt(argv,"hs:b:d:m:t:",["htmldir","builddir=","docbookdir=","masterdir=","templatedir="]) except getopt.GetoptError: print 'Invalid option specified. Usage:' print ' opf_html2db.py -s <htmldir> -b <builddir> -d <docbookdir> -m <masterdir> -t <templatedir>' sys.exit(-1) for opt, arg in opts: if opt == '-h': print 'opf_hmtl2db.py -s <htmldir> -b <builddir> -d <docbookdir> -m <masterdir> -t <templatedir>' sys.exit(0) elif opt in ("-s", "--htmldir"): html_dir = arg elif opt in ("-b", "--builddir"): build_dir = arg elif opt in ("-d", "--docbookdir"): db_dir = arg elif opt in ("-m", "--masterdir"): master_dir = arg elif opt in ("-t", "--templatedir"): template_dir = arg # Verify html directory, error if not found if not os.path.exists(html_dir): print 'ERROR: ' + html_dir + ' does not exist. Please specify path to directory containing single html file.' sys.exit(-11) # Generate path to single file # NOTE: assumption is that file name is always "index.html" (master_doc). If this doesn't prove true, may need to use variable. html_file_src = os.path.join(html_dir, master_doc + '.html') if not os.path.isfile(html_file_src): print 'ERROR: ' + html_file_src + ' does not exist. Please verify path to single html file and file name.' sys.exit(-12) # Convert html file to xml and place in db directory if not os.path.exists(db_dir): print 'Making docbook build directory ' + db_dir os.path.makedirs(db_dir) db_file = os.path.join(db_dir, project + '.xml') if os.path.exists(db_file): os.remove(db_file) # Clean up herold html output print 'Cleaning up html file before processing' html_file = os.path.join(db_dir, master_doc + '.html') html_file_tmp1 = html_file + '.tmp1' shutil.copy2(html_file_src, html_file) cleanup_html(html_file, html_file_tmp1) print 'Converting html file to XML...' print subprocess.check_output(['herold', '-i', html_file_tmp1, '-o', db_file]) # Clone a new Master Directory print 'Cloning new Docs-Master directory...' if os.path.exists(master_dir): shutil.rmtree(master_dir) Repo.clone_from(master_git_url, master_dir) # Clone a new Template Directory print 'Cloning new Docs-Template directory...' if os.path.exists(template_dir): shutil.rmtree(template_dir) Repo.clone_from(template_git_url, template_dir) # Create the new XML file ***** rst_template_dir = os.path.join(template_dir, 'rst_template') full_toc_file = os.path.join(rst_template_dir, toc_file) shutil.copy2(db_file, full_toc_file) book_file = os.path.join(rst_template_dir, 'bk_main.xml') # Update all file in opf_docbook_settings with tag/value combinations specified print 'Updating Docbook files with settings from conf.py...' for f in opf_docbook_settings.keys(): filename = os.path.join(rst_template_dir, f) tags = opf_docbook_settings[f] for tag in tags: value = opf_docbook_settings[f][tag] if value != '': new_str = '<'+tag+'>'+value+'</'+tag+'>' else: new_str = '' old_str = '<'+tag+'>TBD</'+tag+'>' update_file(filename, old_str, new_str) # Parse TOC file, convert high level tag to "book" and write back out to .tmp1 file print 'Cleaning up Docbook file structure...' full_toc_file_tmp1 = full_toc_file+'.tmp1' full_toc_file_tmp2 = full_toc_file+'.tmp2' full_toc_file_tmp3 = full_toc_file+'.tmp3' # Walk document correcting XML errors cleanup_xml( full_toc_file, full_toc_file_tmp1 ) # Remove extraneous sections convert_structure( full_toc_file_tmp1, full_toc_file_tmp2 ) # Eliminate <book> and <title> tags in .tmp1 and write to .tmp2 file remove_book_tags(full_toc_file_tmp2, full_toc_file_tmp3) # Update link to first file insert_toc_into_book(full_toc_file_tmp3, book_file) # Create revision history from Git Log print 'Building document revision history from git log...' build_revhistory(book_file) # TODO: Remove this hack after rst_template bk_main gets updated update_file(book_file, 'xmlns:xlink', 'xmlns:xl') # Perform build of Docbook print 'Building Docbook PDF and HTML output in Maven...' maven_log_file = 'build.log' maven_build = 'cd ' + rst_template_dir + '; mvn generate-sources 2>&1 | tee ' + maven_log_file + '' pipe = Popen(maven_build, shell=True) log, err = pipe.communicate() if pipe.returncode != 0: print "Build failed with return code:%s" % pipe.returncode print "See %s/build.log for more details" & rst_template_dir # Copy output to better location print 'Copying build output...' bld_out_dir = os.path.join(rst_template_dir, 'target/docbkx/webhelp') html_head = os.path.join(bld_out_dir, opf_docbook_settings['pom.xml']['webhelpDirname'] + '/index.html') if os.path.exists(bld_out_dir) and os.path.exists(html_head): doc_dir = os.path.join(build_dir, 'docbook/opf_docbook') if os.path.exists(doc_dir): shutil.rmtree(doc_dir) shutil.copytree(bld_out_dir, doc_dir) print "Build successful. Output files located in %s" % os.path.join(doc_dir, opf_docbook_settings['pom.xml']['webhelpDirname']) sys.exit(0) else: print "Docbook build failed. Check logfile %s for details." % os.path.join(rst_template_dir, maven_log_file) sys.exit(-10) if __name__ == "__main__": main(sys.argv[1:])