#!/usr/bin/env python

"""Quick-and-dirty approximation of hibidi on XHTML.

This doesn't really perform the bidi algorithm at each level.
It only infers the base direction of each element and takes it
as default for the whole element.

This means that in::

    <foo>
        ltr
        <bar>??</bar>
        RTL
        <quux>??</quux>
        RTL
        <baz>??</baz>
        ltr
    </foo>

the neutral element quux will not be inferred as RTL although it should be
according to the spec.

I do intend to implement it fully, but for now this is enough for 98% of mixed
documents out there...

"""

import unicodedata
from xml.dom.minidom import parseString, Node, Text

def hibidi_unicode(u, root='html/body', encoding='utf-8'):
    """Takes an XML unicode string, returns a new one.

    The text is encoded with ``encoding``, processed by ``hibidi_str`` and
    the serialized result is decoded with the same ``encoding``.

    Args:
        u: the XML document as a unicode string.
        root: '/'-separated path of tag names selecting the elements to
            process (passed on to ``hibidi_str``).
        encoding: codec used for the intermediate byte representation.

    Returns:
        The processed document as a unicode string.

    NOTE(review): if ``u`` carries an XML declaration naming a different
    encoding than ``encoding``, nothing here reconciles the two - confirm
    callers do not rely on that.
    """
    # Hand ``encoding`` to hibidi_str so the serializer is asked for bytes in
    # the very codec we decode with.  Without it the serializer falls back to
    # the document's own declared encoding (possibly none at all), and the
    # result is then not something ``.decode(encoding)`` can be applied to.
    return hibidi_str(u.encode(encoding), root, encoding).decode(encoding)

def hibidi_str(s, root='html/body', encoding=None):
    """Takes an XML string, returns a new one.

    Parses ``s``, runs ``hibidi_dom`` on the resulting document using
    ``root`` as the element path, and serializes the document again.

    The output encoding is ``encoding`` when one is given, otherwise the
    encoding recorded on the parsed document (which may be ``None``).
    """
    document = parseString(s)
    hibidi_dom(document, root)
    # An explicit encoding wins; otherwise reuse whatever the parser recorded.
    output_encoding = encoding if encoding else document.encoding
    return document.toxml(encoding=output_encoding)

def hibidi_dom(doc, root='html/body'):
    """Takes a dom object, mutates it in-place.

    ``root`` is a '/'-separated list of tag names.  Starting from ``doc``,
    each name narrows the selection to the descendants with that tag name
    (via ``getElementsByTagName``).  Every element left at the end goes
    through the three passes: infer, assign, apply.
    """
    selected = [doc]
    for tag in root.split('/'):
        narrowed = []
        for ancestor in selected:
            narrowed.extend(ancestor.getElementsByTagName(tag))
        selected = narrowed
    for element in selected:
        infer_dirs(element)
        assign_dirs(element)
        apply_dirs(element)

def text_dir(c):
    """Classify a character as 'R'/'L'/''.

    Looks up the Unicode bidirectional class of ``c``: class L maps to 'L',
    classes R and AL map to 'R', and every other class maps to ''.
    """
    strong_classes = {'L': 'L', 'R': 'R', 'AL': 'R'}
    return strong_classes.get(unicodedata.bidirectional(c), '')

def infer_dirs(node):
    """Infer (store & return) dirs in bottom-up order.

    For an element, the direction is the first strong value found among,
    in this order:

    1. its explicit ``dir`` attribute ('rtl'/'ltr', case-insensitive;
       any other value is ignored),
    2. the characters of its own text / CDATA children,
    3. the inferred directions of its child nodes.

    The result ('R', 'L', or '' when nothing strong was found) is stored
    on the element as ``node.dir`` and returned.  Non-element nodes are
    not annotated and yield ''.
    """
    if node.nodeType != Node.ELEMENT_NODE:
        return ''
    # Recurse eagerly: every descendant element must end up with a ``dir``
    # attribute, even when this node's own direction is settled before its
    # children are consulted.  (A lazy ``map`` - the Python 3 behaviour -
    # would stop at the first strong child and leave the rest unannotated.)
    child_dirs = [infer_dirs(child) for child in node.childNodes]

    def candidates():
        """Yield candidate directions in priority order."""
        # explicit dir attr?
        try:
            declared = node.attributes['dir'].value.lower()
        except KeyError:
            declared = ''
        yield {'rtl': 'R', 'ltr': 'L'}.get(declared, '')
        # directly contains strong text?
        for child in node.childNodes:
            # text nodes don't get their own dir - they are not elements
            if child.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
                for char in child.nodeValue:
                    yield text_dir(char)
        # from child nodes
        for child_dir in child_dirs:
            yield child_dir

    # first strong dir wins; '' marks the element as neutral.
    node.dir = next((found for found in candidates() if found), '')
    return node.dir

def assign_dirs(node, base_dir=''):
    """Assign dirs to neutral nodes.

    Top-down pass: an element whose ``dir`` is empty takes ``base_dir``,
    and each element's (possibly just assigned) ``dir`` becomes the base
    for its children.  Non-element nodes are skipped.  Elements are
    expected to already carry a ``dir`` attribute.
    """
    if node.nodeType == Node.ELEMENT_NODE:
        # Keep a direction that is already set; fall back to the base.
        node.dir = node.dir or base_dir
        inherited = node.dir
        for kid in node.childNodes:
            assign_dirs(kid, inherited)

# Stand-alone text nodes holding the two Unicode directional marks.
# In the code visible here they are referenced only by the disabled
# OpenOffice workaround inside apply_dirs.
LRM, RLM = Text(), Text()
LRM.data = u'\N{LEFT-TO-RIGHT MARK}'
RLM.data = u'\N{RIGHT-TO-LEFT MARK}'

def apply_dirs(node, base_dir=''):
    """Create dir attributes where needed.

    Top-down pass over the subtree rooted at ``node``.  An element gets an
    explicit ``dir`` attribute only when its stored direction (``node.dir``)
    is strong and differs from the direction in effect around it; elements
    that agree with their surroundings, or are still neutral, are left
    untouched.

    Args:
        node: a minidom node; non-element nodes are ignored.
        base_dir: direction in effect outside ``node`` ('R', 'L' or '').

    Every element in the subtree must already carry a ``dir`` attribute.
    """
    if node.nodeType != Node.ELEMENT_NODE:
        return
    direction = node.dir
    # Only a strong direction can be written out.  A neutral element ('')
    # merely inherits, so it must never reach the lookup table below.
    if direction and direction != base_dir:
        # NOTE(review): XHTML defines these values in lower case
        # ('rtl'/'ltr'); the upper-case spelling is kept so existing output
        # does not change - confirm whether consumers depend on it.
        node.attributes['dir'] = {'R': 'RTL', 'L': 'LTR'}[direction]

        # The following helps OpenOffice's HTML parser.  The proper solution
        # is to implement hibidi on OpenOffice documents after the conversion
        # (or fix the parser to use dir attributes), so I don't want this on
        # by default.
        ## mark = {'R': RLM, 'L': LRM}[direction]
        ## if not (node.childNodes[0].nodeType == Node.TEXT_NODE and
        ##         node.childNodes[0].data.startswith('\n')):
        ##     node.childNodes.insert(0, mark)
        ## if not (node.childNodes[-1].nodeType == Node.TEXT_NODE and
        ##         node.childNodes[-1].data.endswith('\n')):
        ##     node.childNodes.append(mark)
        ## node.attributes['lang'] = {'R': 'he', 'L': 'en'}[direction]
        ## node.attributes['align'] = {'R': 'right', 'L': 'left'}[direction]
    # A neutral element does not change the direction in effect, so its
    # children are compared against the surrounding direction instead.
    effective_dir = direction or base_dir
    for child in node.childNodes:
        apply_dirs(child, effective_dir)

if __name__ == '__main__':
    import sys
    # Filter mode: XML document on stdin -> processed document on stdout.
    # Work on the underlying binary streams where they exist (Python 3) so
    # the document's bytes reach the parser untouched and the serialized
    # bytes can be written back without a text/bytes mismatch.
    source = getattr(sys.stdin, 'buffer', sys.stdin)
    sink = getattr(sys.stdout, 'buffer', sys.stdout)
    result = hibidi_str(source.read())
    # hibidi_str serializes to text instead of bytes when the document
    # records no encoding; such a document is UTF-8 by XML default.
    if not isinstance(result, bytes):
        result = result.encode('utf-8')
    sink.write(result)