import re, string

"""
xmlcolor - module for colorizing xml output
written by Maciej "Fiedzia" Dziardziel (fiedzia@fiedzia.prv.pl, jid: fiedzia@chrome.pl)

based on jabber.py(colorize) and snippet submitted by Paul Prescod 
(http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/65125)


USAGE:
import xmlcolor
print xmlcolor.color('<xmlchunk> plaintext </xmlchunk>')
print xmlcolor.color('<tag attr="name"> \n some \n text <one_more_tag/> \n </tag>', infostr='', infocolor='')



python xmlcolor.py file.xml
python xmlcolor.py --infostr=comment --infocolor=red file.xml


If you run this module directly, it will print the xml file given as the last argument.
Possible command-line options are:
  --basic
  --infostr=infostr
  --infocolor=infocolor

TODO:

getopt
better tokenizer with attribute support

"""


class recollector:
    """Registry of named regular-expression fragments.

    Fragments are stored in ``self.res`` keyed by name. A fragment may refer
    to fragments registered earlier with ``%(OtherName)s`` placeholders; the
    placeholders are expanded at registration time, so ``self.res`` always
    holds fully expanded pattern strings.
    """

    def __init__(self):
        # name -> fully expanded pattern string
        self.res = {}

    def add(self, name, reg):
        """Register fragment *reg* under *name*.

        The raw template is compiled once purely as a syntax check
        (``re.error`` propagates if it is malformed); only afterwards are the
        ``%(...)s`` placeholders filled in from the fragments already stored.
        Referring to a name that has not been registered raises ``KeyError``.
        """
        re.compile(reg)
        expanded = reg % self.res
        self.res[name] = expanded
        
# Module-wide registry of pattern fragments used by the tokenizer.
collector = recollector()
# Short alias that keeps the fragment table below compact.
a = collector.add

# Each fragment may reference previously registered fragments through
# %(Name)s placeholders, which add() expands immediately.  The order of the
# calls below therefore matters: a fragment must be registered before any
# fragment that refers to it.

# Character data: one or more characters other than '<'.
a("TextSE" , "[^<]+")
# Comment tail: text up to and including '--', then an optional '>'.
a("UntilHyphen" , "[^-]*-")
a("Until2Hyphens" , "%(UntilHyphen)s(?:[^-]%(UntilHyphen)s)*-")
a("CommentCE" , "%(Until2Hyphens)s>?") 
# CDATA tail (the part following '[CDATA['): text running through ']]>'.
a("UntilRSBs" , "[^\\]]*](?:[^\\]]+])*]+")
a("CDATA_CE" , "%(UntilRSBs)s(?:[^\\]>]%(UntilRSBs)s)*>" )
# A run of whitespace (space, newline, tab, carriage return).
a("S" , "[ \\n\\t\\r]+")
# Names: the first character is an ASCII letter, '_' or ':' or any non-ASCII
# character; later characters may additionally be digits, '.' or '-'.
a("NameStrt" , "[A-Za-z_:]|[^\\x00-\\x7F]")
a("NameChar" , "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]")
a("Name" , "(?:%(NameStrt)s)(?:%(NameChar)s)*")
# A single- or double-quoted string.
a("QuoteSE" , "\"[^\"]*\"|'[^']*'")
# Building blocks for the DOCTYPE declaration.
a("DT_IdentSE" , "%(S)s%(Name)s(?:%(S)s(?:%(Name)s|%(QuoteSE)s))*" )
a("MarkupDeclCE" , "(?:[^\\]\"'><]+|%(QuoteSE)s)*>" )
# Exactly one whitespace character.
a("S1" , "[\\n\\r\\t ]")
# Processing-instruction tail: either an immediate '?>' or one whitespace
# character followed by content running through '?>'.
a("UntilQMs" , "[^?]*\\?+")
a("PI_Tail" , "\\?>|%(S1)s%(UntilQMs)s(?:[^>?]%(UntilQMs)s)*>" )
# One item inside a DOCTYPE internal subset: a comment, a markup declaration,
# a processing instruction, a '%Name;' reference, or whitespace.  The '%%' in
# the template becomes a literal '%' once add() applies %-formatting.
a("DT_ItemSE" ,
    "<(?:!(?:--%(Until2Hyphens)s>|[^-]%(MarkupDeclCE)s)|\\?%(Name)s(?:%(PI_Tail)s))|%%%(Name)s;|%(S)s"
)
# DOCTYPE tail: identifier part, optional '[...]' internal subset, optional '>'.
a("DocTypeCE" ,
"%(DT_IdentSE)s(?:%(S)s)?(?:\\[(?:%(DT_ItemSE)s)*](?:%(S)s)?)?>?" )
# What may follow '<!': a comment, a CDATA section or a DOCTYPE declaration.
a("DeclCE" ,
    "--(?:%(CommentCE)s)?|\\[CDATA\\[(?:%(CDATA_CE)s)?|DOCTYPE(?:%(DocTypeCE)s)?")
# What may follow '<?': a name and an optional processing-instruction tail.
a("PI_CE" , "%(Name)s(?:%(PI_Tail)s)?")
# What may follow '</': a name, optional whitespace and an optional '>'.
a("EndTagCE" , "%(Name)s(?:%(S)s)?>?")
# A quoted attribute value that contains no '<'.
a("AttValSE" , "\"[^<\"]*\"|'[^<']*'")
# What may follow '<' in a start or empty-element tag: a name, any number of
# name=value attributes, optional whitespace, optional '/', optional '>'.
a("ElemTagCE" ,
    "%(Name)s(?:%(S)s%(Name)s(?:%(S)s)?=(?:%(S)s)?(?:%(AttValSE)s))*(?:%(S)s)?/?>?")

# Any markup token: '<' followed by one of the tails above.  Every tail is
# optional, so a lone or truncated '<...' still produces a token.
a("MarkupSPE" ,
    "<(?:!(?:%(DeclCE)s)?|\\?(?:%(PI_CE)s)?|/(?:%(EndTagCE)s)?|(?:%(ElemTagCE)s)?)")
# Top-level token patterns used by lexxml(): text or markup, and markup only.
a("XML_SPE" , "%(TextSE)s|%(MarkupSPE)s")
a("XML_MARKUP_ONLY_SPE" , "%(MarkupSPE)s")


def lexxml(data, markuponly=0):
    """Split *data* into a list of XML tokens.

    With the default ``markuponly=0`` both text runs and markup are
    returned; with a true ``markuponly`` only the markup tokens are.
    """
    pattern_name = "XML_SPE"
    if markuponly:
        pattern_name = "XML_MARKUP_ONLY_SPE"
    tokenizer = re.compile(collector.res[pattern_name])
    return tokenizer.findall(data)


def colorize(txt, col):
    """Return *txt* wrapped in an ANSI foreground-color escape sequence.

    txt -- the string to colorize.
    col -- either a color name ('black', 'red', 'green', 'yellow', 'blue',
           'magenta', 'cyan', 'white') or an integer color number.  An
           integer is inserted into the escape sequence as-is, without any
           range check.

    The result is ESC[;3<n>m + txt + ESC[0m.  When *col* is not a known
    color name (including None or an unhashable value) *txt* is returned
    unchanged, so an unknown color never breaks the output.
    """
    cols = {'black':0, 'red':1, 'green':2, 'yellow':3, 'blue':4, 'magenta':5, 'cyan':6, 'white':7}
    initcode = '\033[;3'
    endcode = '\033[0m'
    if isinstance(col, int):
        return initcode + str(col) + 'm' + txt + endcode
    try:
        return initcode + str(cols[col]) + 'm' + txt + endcode
    except (KeyError, TypeError):
        # KeyError: unknown color name.  TypeError: unhashable color or a
        # txt that cannot be concatenated with str.  Catching only these keeps
        # the best-effort fallback without hiding unrelated errors
        # (e.g. KeyboardInterrupt) the way a bare ``except:`` did.
        return txt


# Maps each token category recognised by colortoken() to a color name
# understood by colorize().
BINDINGS = {
    'unknown': 'cyan',
    'declaration': 'yellow',
    'xml_declaration': 'yellow',
    'processing_instruction': 'blue',
    'end_tag': 'green',
    'empty_tag': 'red',
    'start_tag': 'green',
}


def colortoken(token):
    """Return *token* colorized according to BINDINGS.

    The token category is derived from the token's first and last
    characters:

      '<!...'            -> 'declaration'
      '<?xml...'         -> 'xml_declaration' (any target that starts
                            with 'xml', e.g. '<?xml-stylesheet', too)
      '<?...'            -> 'processing_instruction'
      '</...'            -> 'end_tag'
      '<...' ending '/>' -> 'empty_tag'
      '<...' ending '>'  -> 'start_tag'

    Everything else (plain text, or markup without a closing '>') is
    colored as 'unknown'.
    """
    kind = 'unknown'
    if token.startswith("<!"):
        kind = 'declaration'
    elif token.startswith("<?xml"):
        kind = 'xml_declaration'
    elif token.startswith("<?"):
        # The key used to be misspelled 'xprocessing_instruction', which made
        # every processing instruction raise KeyError.
        kind = 'processing_instruction'
    elif token.startswith("</"):
        kind = 'end_tag'
    elif token.startswith("<"):
        # '/>' must be tested before '>' because it also ends with '>'.
        if token.endswith("/>"):
            kind = 'empty_tag'
        elif token.endswith(">"):
            kind = 'start_tag'
    return colorize(token, BINDINGS[kind])




def color(xmlchunk, infocolor=None, infostr=None, basic=False):
    """Return *xmlchunk* as colorized text.

    xmlchunk  -- string containing xml.
    infocolor -- color name or number for *infostr*.
    infostr   -- text placed in front of every output line; it is only added
                 when both *infostr* and *infocolor* are non-empty.
    basic     -- if true, the xml itself is left uncolored (the infostr
                 prefix is still applied).

    The tokens are concatenated without a separator, so apart from the added
    escape sequences and the optional prefix the text of *xmlchunk* is
    reproduced unchanged.  (Earlier the tokens were joined with
    ``string.join``'s default separator, which inserted a space between
    every pair of tokens.)
    """
    if basic:
        text = xmlchunk
    else:
        text = ''.join([colortoken(token) for token in lexxml(xmlchunk)])

    lines = text.split('\n')
    if infocolor and infostr:
        # The prefix is identical for every line, so build it only once.
        prefix = colorize(infostr, infocolor)
        lines = [prefix + line for line in lines]
    return '\n'.join(lines)



if __name__=="__main__":
    # Command-line entry point: print the (optionally colorized) contents of
    # the file named by the last argument.
    #
    #   python xmlcolor.py [--basic] [--infostr=TEXT] [--infocolor=COLOR] file.xml
    import sys

    if len(sys.argv) < 2:
        # Without arguments sys.argv[-1] would be this script itself.
        sys.exit("usage: python xmlcolor.py [--basic] [--infostr=TEXT] "
                 "[--infocolor=COLOR] file.xml")

    mybasic = myinfostr = myinfocolor = None
    for arg in sys.argv[1:]:
        if arg.startswith('--basic'):
            mybasic = True
        elif arg.startswith('--infostr') or arg.startswith('--infocolor'):
            # Split on the first '=' only, so the value may itself contain '='.
            pieces = arg.split('=', 1)
            if len(pieces) != 2:
                sys.exit("option %s requires a value: %s=VALUE" % (arg, arg))
            if arg.startswith('--infostr'):
                myinfostr = pieces[1]
            else:
                myinfocolor = pieces[1]

    infile = open(sys.argv[-1], 'r')
    try:
        data = infile.read()
    finally:
        # Close the file even when reading fails.
        infile.close()

    # sys.stdout.write() emits the same bytes as the print statement did and
    # is valid syntax on both Python 2 and Python 3.
    sys.stdout.write(color(data, basic=mybasic, infostr=myinfostr, infocolor=myinfocolor) + '\n')
    
