Strip whitespace characters between HTML tags in your pages with CherryPy
import re


class StripWhiteChars(object):
    """Strip all whitespace found between adjacent HTML tags.

    The content of every element listed in ``omitTags`` is protected and
    returned unchanged.

    Author: Jaroslaw Zabiello (http://zabiello.com)
    Version: 2004-11-26
    """

    # Elements whose whole text (tags included) must not be altered.
    omitTags = ['xmp', 'pre', 'script', 'style', 'textarea', 'code']

    def __init__(self):
        """Compile one matcher per protected tag plus the stripping regex."""
        # new object style: http://python.org/doc/newstyle.html
        super(StripWhiteChars, self).__init__()
        self.mark = []
        for tag in self.omitTags:
            # Matches the complete element, from opening to closing tag.
            pattern = '<' + tag + r'[^>]*?>.*?</' + tag + '>'
            record = {
                'tag': tag,
                'regex': re.compile(
                    pattern, re.IGNORECASE | re.MULTILINE | re.DOTALL),
            }
            self.mark.append(record)
        # Two tags separated by nothing but whitespace.
        self.regexStrip = re.compile(
            r'(<[^>]+?>)[\s]+?(<[^>]+?>)', re.MULTILINE | re.DOTALL)

    def strip(self, html):
        """Return *html* with whitespace between adjacent tags removed.

        Elements named in ``omitTags`` are swapped for placeholders first,
        the remaining markup is stripped, and the protected text is put
        back afterwards.
        """
        # (placeholder, original text) pairs in the order they were created.
        marked = []

        def protect(tag):
            # Build a substitution callback giving each match its own
            # unique placeholder.
            def replace(match):
                placeholder = '@@@STRIPPED%s%d@@@' % (tag, len(marked))
                marked.append((placeholder, match.group(0)))
                return placeholder
            return replace

        for dic in self.mark:
            html = dic['regex'].sub(protect(dic['tag']), html)

        # One pass cannot handle chains like <a> <b> <c>, because the match
        # consumes the second tag; repeat until nothing is replaced.
        replaced = 1
        while replaced:
            html, replaced = self.regexStrip.subn(r'\1\2', html)

        # Restore newest first: a region protected later may contain the
        # placeholder of an earlier one (e.g. <pre> inside a <script>
        # string), so the outer text has to come back before the inner one.
        for placeholder, source in reversed(marked):
            html = html.replace(placeholder, source)
        return html


if __name__ == '__main__':
    # test:
    html = '''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl">
<head>
    <title>To jest test</title>
    <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-2" />
    <meta name="language" content="pl" />
    <meta name="robots" content="all" />
    <style type="text/css" media="all">@import "/styles/glowna.css";</style>
    <script type="text/javascript" src="/scripts/funkcje.js"> </script>
</head>
<body id>
<div id="accessibility">goto [<a href="#menu">part one</a>] [<a href="#search">part two</a>]</div>
<div id="container">
    <script language="JavaScript">
    <!--
    document.write('<code> to jest test</code>')
    //-->
    </script>
    <code>
        To jest jakis kod
    </code>
    <script>
    <!--
    document.write('to jest script2')
    //-->
    </script>
    <table border=0>
        <tr>
            <td>
                <A HREF="http://python.org"> <font
                color="red">Python HomePage</font> </A>
            </td>
        </tr>
    </table>
</div>
</body>
</html>
'''
    test = StripWhiteChars()
    # Single-argument call form works on both Python 2 and Python 3.
    print(test.strip(html))
It can be added to CherryPy as a filter:
from cherrypy import cpg
from cherrypy.lib.filter.basefilter import BaseOutputFilter


class StripWhiteCharsFilter(BaseOutputFilter):
    """Output filter that runs the response body through StripWhiteChars."""

    def __init__(self):
        super(StripWhiteCharsFilter, self).__init__()
        # One stripper instance is kept and reused by beforeResponse().
        self.obj = StripWhiteChars()

    def beforeResponse(self):
        # Join the body chunks into a single string, strip it, and store
        # the result back as a one-element list.
        page = ''.join(cpg.response.body)
        cpg.response.body = [self.obj.strip(page)]


testhtml = '<html>.....</html>'


class Root(object):
    """Root object serving the test page with the stripping filter attached."""

    _cpFilterList = [StripWhiteCharsFilter()]

    def index(self, **kwargs):
        return testhtml
    index.exposed = True


cpg.root = Root()
cpg.server.start(configFile='Root.conf')

