Second pass at supporting RST->DB conversions:

- Minor update to rst_template/bk_main.xml to use namespace reference
  "xl" instead of "xlink" to be compatible with herold output
- Create new script to convert singlehtml output of projects
  using herold and then massage into OPF format for inclusion in rst_template

Signed-off-by: Jeff Scheel <>
Jeff Scheel 8 years ago
parent 51954d8265
commit 0751965d7f

@ -27,7 +27,7 @@

<book xmlns=""

@ -0,0 +1,692 @@
# -*- coding: utf-8 -*-
# Builds OpenPOWER Foundation documentation using standard template.
# Assumes the project's singlehtml output exists; it is converted to DocBook with herold.
import os, sys, getopt, shutil, errno, subprocess, copy, re
from os import fdopen, remove
from shutil import move
from git import Repo
from lxml import etree
from conf import opf_docbook_settings, master_doc, project
from subprocess import Popen, PIPE

def copy_xml_to_template(src_dir, tgt_dir):
    """Recursively copy the contents of src_dir into the directory tgt_dir.

    Regular files are copied with shutil.copy.  For every sub-directory a
    directory of the same name is created below tgt_dir (an already existing
    one is reused) and its contents are copied recursively.

    Args:
        src_dir: Directory whose entries are copied.
        tgt_dir: Destination directory; expected to exist already.

    Raises:
        OSError: If a target sub-directory cannot be created for any reason
            other than it already existing.
    """
    for filename in os.listdir(src_dir):
        src_path = os.path.join(src_dir, filename)
        if os.path.isfile(src_path):
            shutil.copy(src_path, tgt_dir)
        elif os.path.isdir(src_path):
            tgt_path = os.path.join(tgt_dir, filename)
            try:
                os.makedirs(tgt_path)
            except OSError as exception:
                # An already existing target directory is fine; anything
                # else (permissions, bad path, ...) must not be hidden.
                if exception.errno != errno.EEXIST:
                    raise
            copy_xml_to_template(src_path, tgt_path)

def update_file(filename, old_str, new_str):
    """Replace every occurrence of old_str in the file with new_str, in place.

    Args:
        filename: Path of the text file to update.
        old_str: Text that must be present in the file.
        new_str: Replacement text.

    If old_str does not occur in the file an error message is printed and
    the file is left untouched.
    NOTE(review): the statement following the error message was lost in this
    listing; returning without rewriting the file is the conservative choice.
    """
    # Verify tag exists
    with open(filename) as f:
        content = f.read()

    if old_str not in content:
        print('Error: "{old_str}" not found in {filename}.'.format(
            old_str=old_str, filename=filename))
        return

    # Only rewrite the file once we know there is something to change
    with open(filename, 'w') as f:
        f.write(content.replace(old_str, new_str))

def traverse_clean_html_source_examples(filename):
    """Strip <span> markup from highlighted code blocks of an HTML file.

    The file is processed line by line.  A code block starts on a line that
    begins with '<div class="highlight-default"><div class="highlight"><pre>'
    and ends on a line that begins with '</pre></div>'.  On the start line and
    every following line up to (not including) the stop line, all opening
    <span ...> tags and all </span> tags are removed.  Every line, changed or
    not, is written to the result.

    The cleaned text replaces `filename`; the previous content is kept as
    `filename + '.bak'`.

    Args:
        filename: Path of the HTML file to clean in place.
    """
    temp_file = filename + '.tmp'
    in_code_block = False
    html_source_start_regex = r'^<div class="highlight-default"><div class="highlight"><pre>'
    html_source_stop_regex = r'^</pre></div>'
    # Every </span> is dropped inside a code block, so every opening <span>
    # must be dropped as well, whatever its attributes are (class names such
    # as "s1" contain digits), otherwise the markup ends up unbalanced.
    span_regex = re.compile(r'<span(\s[^>]*)?>')

    print(filename)
    # Walk file by line
    with open(temp_file, 'w') as new_file:
        with open(filename) as old_file:
            for line in old_file:
                if re.match(html_source_start_regex, line):
                    in_code_block = True
                elif re.match(html_source_stop_regex, line):
                    in_code_block = False

                if in_code_block:
                    line = span_regex.sub('', line.replace('</span>', ''))
                new_file.write(line)

    # Preserve old file
    move(filename, filename + '.bak')
    # Move new file into old
    move(temp_file, filename)

def traverse_clean_html_nodes(element):
    """Recursively move an id attribute from <ul> elements to their first child.

    For every element whose tag contains 'ul' and whose first attribute name
    contains 'id', that attribute is copied to the element's first child and
    deleted from the element; an informational line is written to stderr.
    All children are then visited recursively.

    Args:
        element: Root of the (sub)tree to clean; modified in place.
    """
    tag = element.tag
    # Comment / processing-instruction nodes carry a callable instead of a
    # string tag; they have nothing to clean and must not be string-searched.
    if not callable(tag) and 'ul' in tag and element.attrib:
        key = list(element.attrib)[0]
        value = element.attrib[key]
        if 'id' in key:
            if len(element) == 0:
                # No child to receive the attribute; leave the element alone
                print('Error: Bad assumption. <ul> tag is empty.')
            else:
                first_child = element[0]
                if len(first_child) == 0:
                    # NOTE(review): whatever followed this message in the
                    # original was lost in this listing; processing continues.
                    print('Error: Bad assumption. <ul> tag is empty.')
                # Add attribute to first_child and remove from element
                first_child.attrib[key] = value
                del element.attrib[key]
                sys.stderr.write('**Information: id attribute on <ul> tag to first sub-element, <' + element.tag + '> for ' + key + ' = ' + value + '\n')

    # Snapshot the children so the walk is independent of later tree edits
    for child in list(element):
        traverse_clean_html_nodes(child)

def cleanup_html(infile, outfile):
    """Parse infile, clean up its element tree and write the result to outfile.

    Steps:
      1. Parse infile with an XML parser that keeps comments.
      2. Walk all nodes with traverse_clean_html_nodes().
      3. Write the tree to outfile.
      4. Run traverse_clean_html_source_examples() on outfile.

    Args:
        infile: Path of the HTML file to read (must be well-formed XML).
        outfile: Path of the cleaned file to write.
    """
    # Create internal representation of document from infile
    parser = etree.XMLParser(remove_comments=False)
    tree = etree.parse(infile, parser=parser)
    head = tree.getroot()

    # Walk nodes doing any cleanup
    traverse_clean_html_nodes(head)

    # Persist updates to output file
    tree.write(outfile)

    # Note: This invocation needs to occur post tree-write because
    #       it rewrites the file on disk line by line
    traverse_clean_html_source_examples(outfile)
def find_match(reference, anchor_node, relationship):
    """Find the <anchor> whose id corresponds to a link reference.

    Starting at anchor_node and walking backwards over its preceding
    siblings, return the first element whose tag contains 'anchor' and whose
    first attribute value equals `reference`, optionally followed by a dot
    and a number (for example 'intro' matches 'intro' and 'intro.2').

    Args:
        reference: The linkend value to look for; treated as literal text.
        anchor_node: Element to start from, or None.
        relationship: Label of the node's relation to the link; only used by
            debugging output, kept for interface compatibility.

    Returns:
        The matching element, or None when anchor_node is None, is not an
        <anchor>, or no matching anchor is found.
    """
    if anchor_node is None or callable(anchor_node.tag) or 'anchor' not in anchor_node.tag:
        return None

    # The reference is data, not a pattern: escape it so that characters such
    # as '.' or '+' in an id cannot change the meaning of the regex.
    pattern = re.compile('^' + re.escape(reference) + r'(\.\d+)?$')

    node = anchor_node
    while node is not None:
        # Skip comments (callable tag) and anchors without any attribute
        if not callable(node.tag) and 'anchor' in node.tag and len(node.attrib) > 0:
            key = list(node.attrib)[0]
            if pattern.match(node.attrib[key]):
                return node
        node = node.getprevious()

    return None

def traverse_clean_links(element):
# Purpose: walk the tree below `element` looking for <link> elements whose
# only attribute is `linkend` and whose text is the pilcrow sign, and pair each
# of them with its <anchor> using find_match().  Candidates are tried in this
# order: the previous sibling of the link's grandparent ("great uncle"), the
# last-descendant anchor below that sibling ("cousin"), and the previous
# sibling of the link's parent ("uncle").
# NOTE(review): this listing has lost its indentation and appears to be
# missing statements (the code that removes the <link>, sets the id on the
# parent/grandparent, removes the <anchor>, the `else:` branches and the body
# of the final loop).  Restore them from the original script before use.

if 'link' in element.tag:
# Note: Terminal tag, no need to recurse
# Gather link details
text = element.text
num_attributes = element.attrib.__len__()
reference = element.attrib.get('linkend',None)
# NOTE(review): `is 1` tests object identity, not value; `== 1` is presumably intended.
if num_attributes is 1 and not reference is None and text == u'¶':
# Erroneous link message, find related anchor, could be "uncle" or "cousin" (of various degrees)
anchor = None
parent = element.getparent()
grandparent = parent.getparent()
greatuncle = grandparent.getprevious()
# Check Great Uncle for match
anchor = find_match(reference, greatuncle, 'Great Uncle')
# If no match, locate "cousin" and if found, check it
if anchor is None:
cousin = None
if not greatuncle is None:
node = greatuncle
# Descend along the last child of each level until an <anchor> is reached
while node.__len__() > 0 and cousin is None:
node = node.__getitem__(node.__len__() -1)
if 'anchor' in node.tag:
cousin = node
if not cousin is None:
anchor = find_match(reference, cousin, 'Cousin')
# If no match, try uncle
if anchor is None:
uncle = parent.getprevious()
anchor = find_match(reference, uncle, 'Uncle')
# Always delete <link> tag of this type (contains only u'¶' for text)
# NOTE(review): no removal statement is visible here -- presumably lost.
if not anchor is None:
# print 'MATCH FOUND: ', reference

# Retrieve attribute key from anchor
# Note: The <link> key is always correctly set by herold in the case of duplicate keys.
#       The <anchor> tag may have a "dot" and a number appended to value in <link>.
key = anchor.attrib.keys()[0]
value = anchor.get(key)
if 'title' in parent.tag:
# Add id attribute to Grandparent
# Add id attribute to Parent
# NOTE(review): both assignments and the `else:` between them are missing from this listing.
sys.stderr.write( '**Information: removed dummy link and for ' + reference + ' and added proper xml:id as ' + value + '\n' )
# Delete <anchor> tag
anchor_parent = anchor.getparent()
# Nothing more to do
# NOTE(review): the message below belongs to the "no anchor found" case; its `else:` is missing.
sys.stderr.write( '**Information: Matching <anchor> element not found for reference = ' + reference + '. Link removed.' + '\n' )
# NOTE(review): loop body missing; presumably a recursive call on each child -- confirm.
for child in element.getchildren():

def traverse_clean_other(element):
# Purpose: clean up individual DocBook elements by tag name:
#   <informalexample> and <note> are to be promoted to a new <section> with a
#   <title>, <anchor> elements are removed from their parent, and a <section>
#   holding only one child gets an extra <para>.
# NOTE(review): this listing has lost its indentation and appears to be
# missing the statements that attach the new elements to the tree, the
# bodies of the `for child ...` loops and the recursive traversal.  Only the
# <anchor> branch is visibly complete.  Restore from the original script.
if 'informalexample' in element.tag:
# Get key elements around this one
parent = element.getparent()
grandparent = parent.getparent()

# Create new elements -- section and title (use text from informal example element)
# The new section reuses the grandparent's tag name
new_section = parent.makeelement(grandparent.tag)
new_title = parent.makeelement('title')
title = element.text
new_title.text = title

# Add title to new section

# Copy over children from <informalexample> to new <section>
for child in element.getchildren():

# print 'DEBUG: old tree...'
# print_tree(parent, 0, 2)

# Add new <section> as next sibling of parent and remove <informalexample> from parent

# print 'DEBUG: new tree...'
# print_tree(parent.getparent(), 0, 3)
# NOTE(review): element.text and title are concatenated directly; this raises TypeError if the element has no text.
sys.stderr.write( '**Information: <informalexample> ' + element.text + ' removed and promoted as <section> with title: ' + title + '\n' )

elif 'note' in element.tag:
# Get key elements around this one
parent = element.getparent()
grandparent = parent.getparent()

# print 'DEBUG: old tree...'
# print_tree(parent, 0, 4)

# Create new elements -- section and title (use text from bridgehead subelement)
# Here the new section reuses the parent's tag name (the branch above uses the grandparent's)
new_section = parent.makeelement(parent.tag)
# The title source is expected at note/<first child>/<first child>
bridgehead = element.__getitem__(0).__getitem__(0)
if not 'bridgehead' in bridgehead.tag:
print 'Error: Bad assumption about <note> structure.  Bridgehead not found as expected.'
title = bridgehead.text
new_title = parent.makeelement('title')
new_title.text = title

# Add title to new section
# Remove <bridgehead> from <note>
# Copy over remaining items in <note> to new <section>
for child in element.getchildren():
# Add new <section> as next sibling of parent and remove <note> from parent

# print 'DEBUG: New tree...'
# print_tree(grandparent, 0, 3)
sys.stderr.write( '**Information: <note> removed and promoted as <section> with title: ' + title + '\n' )

elif 'anchor' in element.tag:
# Get key elements around this one
parent = element.getparent()

# Retrieve anchor details
key = element.attrib.keys()[0]
value = element.attrib[ key ]

# Remove node
parent.remove( element );

sys.stderr.write( '**Information: removed <anchor> with id: ' + value + '\n' )
elif 'section' in element.tag:
#Ensure at least one child beyond <title>
if element.__len__() == 1:
title = element.__getitem__(0).text
parent = element.getparent()

# Make and add empty paragraph to section, just behind title
new_para = parent.makeelement('para')
# NOTE(review): assigned as element text, so the serializer will presumably escape the '&' -- confirm the intended output.
new_para.text = '&nbsp;'
# NOTE(review): the statement that adds new_para to the section is not visible here.
sys.stderr.write( '**Information: <para> tag added to empty section with title: ' + title + '\n' )

# NOTE(review): loop body missing; presumably a recursive call on each child -- confirm.
for child in element.getchildren():

def cleanup_xml(infile, outfile):
    """Parse the DocBook file infile, repair it and write the result to outfile.

    The tree is cleaned in two passes: traverse_clean_links() first, then
    traverse_clean_other().

    Args:
        infile: Path of the XML file to read.
        outfile: Path of the cleaned XML file to write.
    """
    # Create internal representation of document from infile
    parser = etree.XMLParser(remove_comments=False)
    tree = etree.parse(infile, parser=parser)
    head = tree.getroot()

    # Note: because link cleanup involves relative location of multiple tags,
    #       it must be separate and first (the second pass removes anchors
    #       the first pass needs to find).
    traverse_clean_links(head)
    traverse_clean_other(head)

    # Persist updates to output file
    tree.write(outfile)
def print_tree(element, level, max_depth):
    """Print a debugging dump of an element tree, one line per element.

    Args:
        element: Root of the (sub)tree to print.
        level: Depth of `element`; controls the indentation (level + 1 spaces).
        max_depth: Elements at a level >= max_depth are not printed.
    """
    # Nothing at or below this level is printed, so there is no need to walk it
    if level >= max_depth:
        return

    num_children = len(element)
    indent = ' '.ljust(level + 1)
    fields = (indent, 'Tag: ', element.tag, ' Attrib: ', element.attrib,
              ' Text: >', element.text, '< Num children: ', num_children)
    # Single pre-joined argument: prints the same on Python 2 and Python 3
    print(u' '.join(u'{0}'.format(field) for field in fields))

    for child in element:
        print_tree(child, level + 1, max_depth)

def traverse_clean_sections(element):
    """Delete blacklisted <section> children of element.

    Only the direct children of `element` are examined.  The title of a
    section is the text of its first child, or the text of that child's first
    child when the first child has children of its own.  Sections titled
    'Navigation' or 'Table Of Contents' are removed from `element`.

    NOTE(review): the original removal statement and index bookkeeping were
    lost in this listing; this version iterates over a snapshot instead.

    Args:
        element: Parent element; modified in place.
    """
    section_blacklist = ['Navigation', 'Table Of Contents']

    # Iterate over a snapshot: children are removed while walking
    for child in list(element):
        # Comments / processing instructions have a callable tag
        if callable(child.tag) or 'section' not in child.tag:
            continue

        title = ''
        if len(child) > 0:
            first_grandchild = child[0]
            if len(first_grandchild) == 0:
                title = first_grandchild.text
            else:
                # Title text wrapped in inline markup: use the first inline child
                title = first_grandchild[0].text

        if title in section_blacklist:
            element.remove(child)
def eliminate_top_section(head):
    """Drop the wrapper section of the document and promote its sub-sections.

    First every child of `head` whose tag contains 'info' or 'index' is
    removed.  `head` must then hold exactly one child, a <section> that only
    carries the document title; that section is removed and its <section>
    children become direct children of `head`.

    If the assumptions do not hold an error is printed and `head` is left
    as it is after the info/index removal.
    NOTE(review): whatever followed the error messages in the original was
    lost in this listing.

    Args:
        head: Root element of the document; modified in place.
    """
    # Remove <info> and <index> sections
    for child in list(head):
        if not callable(child.tag) and ('info' in child.tag or 'index' in child.tag):
            head.remove(child)

    # Eliminate head section which really is title
    if len(head) != 1:
        print('Error: Bad assumption.  Too many sections (' + str(len(head)) + ') found in base document.')
        return

    first_section = head[0]
    if callable(first_section.tag) or 'section' not in first_section.tag:
        print('Error: Bad assumption.  Top tag in document is not a section.')
        return

    # Promote sections: collect first, then detach the wrapper and re-attach
    promoted = [child for child in list(first_section)
                if not callable(child.tag) and 'section' in child.tag]
    head.remove(first_section)
    for child in promoted:
        head.append(child)

def transform_head_sections(head):
    """Rename every top-level <section> child of head to <chapter>.

    Only direct children are renamed; nested sections keep their tag.  An
    error message is printed when no child was renamed.

    Args:
        head: Root element of the document; modified in place.
    """
    num_chapter = 0
    for child in list(head):
        # Comments / processing instructions have a callable tag
        if not callable(child.tag) and 'section' in child.tag:
            child.tag = child.tag.replace('section', 'chapter')
            num_chapter += 1

    if num_chapter == 0:
        print('Error: No chapters found in document')

def convert_structure(infile, outfile):
    """Convert an <article> document into the body of a <book>.

    The root <article> is renamed to <book> and stripped of its attributes,
    blacklisted sections are removed, the wrapper section holding the
    document title is eliminated and the remaining top-level sections are
    renamed to <chapter>.  The result is written to outfile.

    NOTE(review): the statements after the error messages and the `else:`
    were lost in this listing; here a message is printed and processing
    continues.

    Args:
        infile: Path of the XML file to read.
        outfile: Path of the restructured XML file to write.
    """
    # Create internal representation of document from infile
    parser = etree.XMLParser(remove_comments=False)
    tree = etree.parse(infile, parser=parser)
    head = tree.getroot()

    if 'article' in head.tag:
        head.tag = 'book'
        # Clear attributes (iterate over a copy of the names while deleting)
        for attrib in list(head.attrib):
            head.attrib.pop(attrib, None)
        if len(head.attrib) != 0:
            remaining = ('Error: Section attributes not removed. ', head.attrib.items(),
                         ' items remain -- ', head.attrib.keys())
            print(u' '.join(u'{0}'.format(part) for part in remaining))
    else:
        print(u' '.join(u'{0}'.format(part)
                        for part in ('Toc file contains ', head.tag, 'tag, not <article>')))

    # Traverse tree sections, removing nodes as needed
    traverse_clean_sections(head)

    # Eliminate first section, placeholder for document title
    eliminate_top_section(head)

    # Traverse remaining top level <section> and convert to <chapter>
    transform_head_sections(head)

    # Persist updates to output file
    tree.write(outfile)

def remove_book_tags(old_file, new_file):
    """Copy old_file to new_file without the lines holding the <book> tags.

    Every line containing '<book' or '</book>' is dropped as a whole (this
    also drops anything else that happens to share such a line); all other
    lines are copied unchanged.

    Args:
        old_file: Path of the file to read.
        new_file: Path of the file to write (overwritten).
    """
    # Read and write bytes so the copy is exact and behaves the same on
    # Python 2 and Python 3 (the output was already opened in binary mode).
    with open(old_file, 'rb') as source:
        with open(new_file, 'wb') as output:
            for line in source:
                if b'<book' not in line and b'</book>' not in line:
                    output.write(line)

def insert_toc_into_book(toc_file, book_file):
    """Replace the '<!--TBD-->' placeholder line of book_file with toc_file.

    book_file is first copied to book_file + '.bak'.  It is then rewritten
    from that copy: every line containing the placeholder is replaced by the
    complete contents of toc_file, all other lines are kept.  An error
    message is printed when the placeholder does not occur.

    Args:
        toc_file: Path of the file whose contents are inserted.
        book_file: Path of the file to update in place.
    """
    book_file_bak = book_file + '.bak'
    shutil.copy2(book_file, book_file_bak)
    key_string = '<!--TBD-->'
    marker = key_string.encode('ascii')
    inserted_toc = False

    # Bytes in, bytes out: exact copy, identical on Python 2 and Python 3
    with open(book_file_bak, 'rb') as source:
        with open(book_file, 'wb') as output:
            for line in source:
                if marker not in line:
                    output.write(line)
                    continue
                inserted_toc = True
                # Write toc_file contents in place of the placeholder line
                with open(toc_file, 'rb') as input_toc:
                    shutil.copyfileobj(input_toc, output)

    if not inserted_toc:
        print(' '.join(['Error: key string of "', key_string, '" not found in ', book_file]))

def build_revhistory(book_file):
    """Fill the revision history of book_file from the git log.

    Runs `git log` for the current directory, turns every commit into a
    DocBook <revision> (commit date, subject and abbreviated hash) and
    replaces the '<revhistory>TBD</revhistory>' placeholder in book_file
    with the resulting <revhistory> element via update_file().

    Args:
        book_file: Path of the DocBook file holding the placeholder.
    """
    # Variables for formating git log: fields are separated by 0x01,
    # records are terminated by 0x02
    log_format = '%h%x01%an%x01%ad%x01%s%x02'
    log_fields = ['id', 'author', 'date', 'subject']

    # Retrieve log.  An argument list avoids shell quoting of the format;
    # universal_newlines makes the output text on Python 3 as well.
    pipe = Popen(['git', 'log', '--date=iso', '--format=' + log_format, '--', '.'],
                 stdout=PIPE, universal_newlines=True)
    log, _ = pipe.communicate()

    # Substitute for problem characters: &, <, >
    log = log.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    # Remove newlines, trailing end-of-record (0x02), and then split at end-of-record
    records = log.replace('\n', '').strip('\x02').split('\x02')

    # Format log into revision history
    parts = ['<revhistory>\n']
    for record in records:
        fields = record.split('\x01')
        # Empty git output (no commits, not a repository) yields one empty
        # record; skip anything that is not a complete set of fields.
        if len(fields) != len(log_fields):
            continue
        entry = dict(zip(log_fields, fields))
        parts.append('<revision><date>' + entry['date'].split(' ')[0] +
                     '</date><revdescription><para>' + entry['subject'] +
                     ' (' + entry['id'] + ')</para></revdescription></revision>\n')
    parts.append('</revhistory>\n')

    # Update file
    rev_str = '<revhistory>TBD</revhistory>'
    update_file(book_file, rev_str, ''.join(parts))

def main(argv):
# Purpose: command-line driver.  Reads the -s/-b/-d/-m/-t options, copies the
# single HTML file (master_doc + '.html') into the docbook directory, cleans it
# with cleanup_html(), converts it to XML with the external `herold` tool,
# copies the result into the 'rst_template' directory of the template checkout,
# applies the substitutions from opf_docbook_settings, restructures the XML,
# inserts it into bk_main.xml, runs the Maven build and copies the webhelp
# output below the build directory.
# NOTE(review): this listing has lost its indentation and appears to be
# missing statements (`try:`, `else:`, directory creation/removal, exits after
# errors, the call that builds the revision history).  Restore them from the
# original script before use.
# NOTE(review): both clone URLs are empty strings here -- presumably stripped
# from this listing; Repo.clone_from() needs the real URLs.
master_git_url = ''
template_git_url = ''
html_dir = ''
build_dir = ''
db_dir = ''
master_dir = ''
template_dir = ''
toc_file = master_doc+'.xml'

# NOTE(review): "htmldir" lacks the trailing '=' the other long options have,
# so --htmldir would not accept a value.  The matching `try:` is not visible.
opts, args = getopt.getopt(argv,"hs:b:d:m:t:",["htmldir","builddir=","docbookdir=","masterdir=","templatedir="])
except getopt.GetoptError:
print 'Invalid option specified.  Usage:'
print ' -s <htmldir> -b <builddir> -d <docbookdir> -m <masterdir> -t <templatedir>'
for opt, arg in opts:
if opt == '-h':
print ' -s <htmldir> -b <builddir> -d <docbookdir> -m <masterdir> -t <templatedir>'
elif opt in ("-s", "--htmldir"):
html_dir = arg
elif opt in ("-b", "--builddir"):
build_dir = arg
elif opt in ("-d", "--docbookdir"):
db_dir = arg
elif opt in ("-m", "--masterdir"):
master_dir = arg
elif opt in ("-t", "--templatedir"):
template_dir = arg

# Verify html directory, error if not found
if not os.path.exists(html_dir):
print 'ERROR: ' + html_dir + ' does not exist.  Please specify path to directory containing single html file.'

# Generate path to single file
# NOTE: assumption is that file name is always "index.html" (master_doc).  If this doesn't prove true, may need to use variable.
html_file_src = os.path.join(html_dir, master_doc + '.html')

if not os.path.isfile(html_file_src):
print 'ERROR: ' + html_file_src + ' does not exist.  Please verify path to single html file and file name.'

# Convert html file to xml and place in db directory
if not os.path.exists(db_dir):
print 'Making docbook build directory ' + db_dir

db_file = os.path.join(db_dir, project + '.xml')
# NOTE(review): body of this `if` is missing; presumably removes a stale output file -- confirm.
if os.path.exists(db_file):

# Clean up herold html output
print 'Cleaning up html file before processing'
html_file = os.path.join(db_dir, master_doc + '.html')
html_file_tmp1 = html_file + '.tmp1'
shutil.copy2(html_file_src, html_file)
cleanup_html(html_file, html_file_tmp1)

print 'Converting html file to XML...'
print subprocess.check_output(['herold', '-i', html_file_tmp1, '-o', db_file])
# Clone a new Master Directory
print 'Cloning new Docs-Master directory...'
# NOTE(review): as listed, the clone only happens when the directory already
# exists; a statement clearing the old checkout is presumably missing.
if os.path.exists(master_dir):
Repo.clone_from(master_git_url, master_dir)
# Clone a new Template Directory
print 'Cloning new Docs-Template directory...'
if os.path.exists(template_dir):
Repo.clone_from(template_git_url, template_dir)
# Create the new XML file *****
rst_template_dir = os.path.join(template_dir, 'rst_template')
full_toc_file = os.path.join(rst_template_dir, toc_file)
shutil.copy2(db_file, full_toc_file)
book_file = os.path.join(rst_template_dir, 'bk_main.xml')
# Update all file in opf_docbook_settings with tag/value combinations specified
print 'Updating Docbook files with settings from'
# opf_docbook_settings maps a file name to a {tag: value} mapping
for f in opf_docbook_settings.keys():
filename = os.path.join(rst_template_dir, f)
tags = opf_docbook_settings[f]

for tag in tags:
value = opf_docbook_settings[f][tag]
if value != '':
new_str = '<'+tag+'>'+value+'</'+tag+'>'
# NOTE(review): as listed this unconditionally discards the value built above;
# an `else:` is presumably missing before this line.
new_str = ''

old_str = '<'+tag+'>TBD</'+tag+'>'
update_file(filename, old_str, new_str)
# Parse TOC file, convert high level tag to "book" and write back out to .tmp1 file
print 'Cleaning up Docbook file structure...'
full_toc_file_tmp1 = full_toc_file+'.tmp1'
full_toc_file_tmp2 = full_toc_file+'.tmp2'
full_toc_file_tmp3 = full_toc_file+'.tmp3'

# Walk document correcting XML errors
cleanup_xml( full_toc_file, full_toc_file_tmp1 )
# Remove extraneous sections
convert_structure( full_toc_file_tmp1, full_toc_file_tmp2 )
# Eliminate <book> and <title> tags in .tmp1 and write to .tmp2 file
# (the call below actually reads .tmp2 and writes .tmp3)
remove_book_tags(full_toc_file_tmp2, full_toc_file_tmp3)

# Update link to first file
insert_toc_into_book(full_toc_file_tmp3, book_file)
# Create revision history from Git Log
print 'Building document revision history from git log...'
# NOTE(review): no call to build_revhistory(book_file) is visible after this message.

# TODO: Remove this hack after rst_template bk_main gets updated
update_file(book_file, 'xmlns:xlink', 'xmlns:xl')
# Perform build of Docbook
print 'Building Docbook PDF and HTML output in Maven...'
maven_log_file = 'build.log'
# The command runs through the shell: change directory, build, and mirror the output into the log file
maven_build = 'cd ' + rst_template_dir + '; mvn generate-sources 2>&1 | tee ' + maven_log_file + ''
pipe = Popen(maven_build, shell=True)
log, err = pipe.communicate()
# NOTE(review): with a shell pipeline the return code is presumably that of
# `tee`, not of `mvn` -- confirm that build failures are detected.
if pipe.returncode != 0:
print "Build failed with return code:%s" % pipe.returncode
# NOTE(review): `&` below should presumably be the `%` formatting operator; as written it raises TypeError.
print "See %s/build.log for more details" & rst_template_dir
# Copy output to better location
print 'Copying build output...'
bld_out_dir = os.path.join(rst_template_dir, 'target/docbkx/webhelp')
html_head = os.path.join(bld_out_dir, opf_docbook_settings['pom.xml']['webhelpDirname'] + '/index.html')
if os.path.exists(bld_out_dir) and os.path.exists(html_head):
doc_dir = os.path.join(build_dir, 'docbook/opf_docbook')
# NOTE(review): shutil.copytree() fails when the destination exists; a removal
# statement is presumably missing from the body of this `if`.
if os.path.exists(doc_dir):
shutil.copytree(bld_out_dir, doc_dir)
print "Build successful.  Output files located in %s" % os.path.join(doc_dir, opf_docbook_settings['pom.xml']['webhelpDirname'])

# NOTE(review): failure message of the missing `else:` branch of the output check above.
print "Docbook build failed.  Check logfile %s for details." % os.path.join(rst_template_dir, maven_log_file)

# Script entry point: hand the command-line arguments (without the program
# name, as getopt expects) to main().
if __name__ == "__main__":
    main(sys.argv[1:])