CherryPy Project Download

root/tidy.py

Revision 7 (checked in by fumanchu, 3 years ago)

Dropping the tidy and nsgmls Tools here after being ripped out of the CherryPy distro.

Line 
1 """Functions to run cherrypy.response through Tidy or NSGML."""
2
3 import cgi
4 import os
5 try:
6     from cStringIO import StringIO
7 except ImportError:
8     from StringIO import StringIO
9 import traceback
10
11 import cherrypy
12
def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
         indent=False, wrap=False, warnings=True):
    """Run cherrypy.response through Tidy.

    If either 'indent' or 'wrap' are specified, then response.body will be
    set to the output of tidy. Otherwise, only errors (including warnings,
    if warnings is True) will change the body.

    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module does not seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash)

    Parameters:
        temp_dir: directory in which the fixed-name work files 'page.html',
            'tidy.out' and 'tidy.err' are written. The names are not unique
            per request, so calls sharing one temp_dir overwrite each
            other's files.
        tidy_path: path of the tidy executable.
        strict_xml: if true, pass -xml to tidy and, when tidy reports no
            errors, additionally parse the body with ElementTree.
        errors_to_ignore: optional list of substrings; a tidy message
            containing any of them is not reported.
        indent: if true, pass -indent to tidy.
        wrap: False (the default) passes no -wrap option; any other value
            is converted with int() and passed as the -wrap column. Values
            int() cannot convert are ignored.
        warnings: if true, lines containing 'Warning' are reported as well
            as lines containing 'Error'.

    Only responses whose Content-Type is 'text/html' are touched.
    """
    response = cherrypy.response

    # the tidy tool, by its very nature it's not generator friendly,
    # so we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0]
    encoding = ''
    i = fct.find('charset=')
    if i != -1:
        # Stop at the next parameter so trailing Content-Type parameters
        # never leak into the tidy command line or the XML declaration.
        encoding = fct[i + 8:].split(';')[0].strip()

    if ct != 'text/html':
        return

    page_file = os.path.join(temp_dir, 'page.html')
    with open(page_file, 'wb') as page:
        page.write(orig_body)

    out_file = os.path.join(temp_dir, 'tidy.out')
    err_file = os.path.join(temp_dir, 'tidy.err')

    # Tidy spells encodings without dashes, as a flag (e.g. -utf8).
    tidy_enc = encoding.replace('-', '')
    if tidy_enc:
        tidy_enc = '-' + tidy_enc

    xml_opt = ' -xml' if strict_xml else ''
    indent_opt = ' -indent' if indent else ''

    if wrap is False:
        wrap_opt = ''
    else:
        try:
            wrap_opt = ' -wrap %d' % int(wrap)
        except (TypeError, ValueError):
            # Not a usable column number: fall back to no -wrap option.
            wrap_opt = ''

    result = os.system('"%s" %s%s%s%s -f %s -o %s %s' %
                       (tidy_path, tidy_enc, xml_opt, indent_opt, wrap_opt,
                        err_file, out_file, page_file))

    # Tidy's output replaces the body only when reformatting was requested
    # and the command returned a zero status.
    use_output = bool(indent_opt or wrap_opt) and not result
    if use_output:
        with open(out_file, 'rb') as out:
            output = out.read()

    with open(err_file, 'rb') as errs:
        err_lines = errs.read().splitlines()

    ignorable = errors_to_ignore or []
    new_errs = []
    for err in err_lines:
        reportable = (err.find('Error') != -1 or
                      (warnings and err.find('Warning') != -1))
        if not reportable:
            continue
        if any(err.find(err_ign) != -1 for err_ign in ignorable):
            continue
        new_errs.append(err)

    if new_errs:
        # wrong_content() escapes the header and turns newlines into
        # <br />, so join with newlines rather than with markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        _drop_content_length(response)
        return
    elif strict_xml:
        # The HTML is OK, but is it valid XML?
        # Use elementtree to parse XML
        try:
            from xml.etree.ElementTree import parse
        except ImportError:
            from elementtree.ElementTree import parse

        # Replace HTML entities the XML parser does not define.
        for tag in ('nbsp', 'quot'):
            orig_body = orig_body.replace('&' + tag + ';', tag.upper())

        if encoding:
            enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
            orig_body = enctag + orig_body

        try:
            parse(StringIO(orig_body))
        except Exception:
            # Wrong XML: show the parser traceback above the numbered body.
            trace = StringIO()
            traceback.print_exc(file=trace)
            response.body = wrong_content(trace.getvalue(), orig_body, "XML")
            _drop_content_length(response)
            return

    if use_output:
        response.body = [output]
        _drop_content_length(response)


def _drop_content_length(response):
    """Delete the Content-Length header so finalize() recalcs it."""
    if "Content-Length" in response.headers:
        del response.headers["Content-Length"]
120
def html_space(text):
    """Escape text, replacing space with nbsp and tab with 4 nbsp's.

    '&', '<' and '>' are escaped; quote characters are left untouched.
    """
    # cgi.escape is gone from newer Pythons; html.escape with quote=False
    # produces the same result. Fall back to cgi where html is unavailable.
    try:
        from html import escape
    except ImportError:
        from cgi import escape
    escaped = escape(text, quote=False)
    # Expand tabs to four spaces first so they become four nbsp's too.
    return escaped.replace('\t', '    ').replace(' ', '&nbsp;')
124
def html_break(text):
    """Escape text, replacing newline with HTML br element.

    '&', '<' and '>' are escaped; quote characters are left untouched.
    """
    # cgi.escape is gone from newer Pythons; html.escape with quote=False
    # produces the same result. Fall back to cgi where html is unavailable.
    try:
        from html import escape
    except ImportError:
        from cgi import escape
    return escape(text, quote=False).replace('\n', '<br />')
128
def wrong_content(header, body, content_type="HTML"):
    """Build an HTML report: an escaped header followed by the numbered body.

    'header' is escaped with html_break (newlines become <br />) and shown
    after a "Wrong <content_type>:" banner. Each line of 'body' is escaped
    with html_space and prefixed with its 1-based, zero-padded line number.
    All pieces are joined with <br />.
    """
    banner = "Wrong %s:<br />%s<br />" % (content_type, html_break(header))
    numbered = [
        "%03d - %s" % (index + 1, html_space(text))
        for index, text in enumerate(body.splitlines())
    ]
    return "<br />".join([banner] + numbered)
134
135
def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
    """Run cherrypy.response through nsgmls.

    If nsgmls reports any message that is not filtered out, response.body
    is replaced by a report listing the messages above the numbered page
    source. Only responses whose Content-Type is 'text/html' are touched.

    Parameters:
        temp_dir: directory in which the fixed-name work files 'page.html'
            and 'nsgmls.err' are written. The names are not unique per
            request, so calls sharing one temp_dir overwrite each other's
            files.
        nsgmls_path: path of the nsgmls executable.
        catalog_path: catalog file, passed to nsgmls with -c.
        errors_to_ignore: optional list of substrings; a message containing
            any of them is not reported.
    """
    response = cherrypy.response

    # nsgmls works on a whole file, so it is not generator friendly;
    # we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0]
    if ct != 'text/html':
        return

    # Remove bits of Javascript (nsgmls doesn't seem to handle
    #   them correctly (for instance, if <a appears in your
    #   Javascript code nsgmls complains about it)
    while True:
        i = orig_body.find('<script')
        if i == -1:
            break
        j = orig_body.find('</script>', i)
        if j == -1:
            # Unterminated script element: leave the rest as it is.
            break
        orig_body = orig_body[:i] + orig_body[j + 9:]

    page_file = os.path.join(temp_dir, 'page.html')
    with open(page_file, 'wb') as page:
        page.write(orig_body)

    err_file = os.path.join(temp_dir, 'nsgmls.err')
    command = ('%s -c%s -f%s -s -E10 %s' %
               (nsgmls_path, catalog_path, err_file, page_file))
    # Use forward slashes throughout the command line.
    command = command.replace('\\', '/')
    os.system(command)

    with open(err_file, 'rb') as err_stream:
        errs = err_stream.read()

    ignorable = errors_to_ignore or []
    new_errs = [
        err for err in errs.splitlines()
        if not any(err.find(err_ign) != -1 for err_ign in ignorable)
    ]

    if new_errs:
        # wrong_content() escapes the header and turns newlines into
        # <br />, so join with newlines rather than with markup. The body
        # shown is the script-stripped text that nsgmls actually checked.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        if "Content-Length" in response.headers:
            # Delete Content-Length header so finalize() recalcs it.
            del response.headers["Content-Length"]
187
# Expose both validators on cherrypy.tools, each wrapped in a Tool that is
# attached at the 'before_finalize' hook point.
_VALIDATOR_HOOK = 'before_finalize'
cherrypy.tools.tidy = cherrypy.Tool(_VALIDATOR_HOOK, tidy)
cherrypy.tools.nsgmls = cherrypy.Tool(_VALIDATOR_HOOK, nsgmls)
190
Note: See TracBrowser for help on using the browser.

Hosted by WebFaction

Log in as guest/cherrypy to create/edit wiki pages