[reportlab-users] new pyRXP
Stuart Bishop
reportlab-users@reportlab.com
Thu, 10 Apr 2003 16:33:46 +1000
--Apple-Mail-2-782266559
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
format=flowed
On Wednesday, April 9, 2003, at 05:25 PM, Robin Becker wrote:
> In article <6CDB856C-6A39-11D7-9FD0-000393B63DDC@shangri-
> la.dropbear.id.au>, Stuart Bishop <zen@shangri-la.dropbear.id.au>
> writes
> .....
>> There is still a double free around, and I can trigger it with my
>> test suite. It seems to be in the parser cleanup routines. I'll have
>> a closer look at this when I get a chance, but anyone more familiar
>> with malloc debugging and the time is welcome to do it for me :-)
>> I've attached the test cases that can trigger the issue in case anyone
>> is feeling enthusiastic. This particular problem should only bite
>> people
>> who are parsing XML from untrusted sources since it takes particularly
>> broken XML to trigger the malloc errors as far as I can tell.
> .... if I get some more time I'll have a go at this, especially since
> we
> now have a later rxp source. Thanks for the testing, are you now using
> tests based on the later comment/processing instruction handling? I saw
Forgot about the new comment/pi handling - which I need (not for the
comments or pi's themselves, but to make sure that I handle CDATA
sections
properly - before I couldn't tell the difference between <!-- foo --> and
<![CDATA[<!-- foo-->]]> ). Yup - seems to be running well :-)
> also that you were attempting to get at the prolog in some way, is that
> still required? It seems quite deeply buried in rxp.
pyRXPU does everything I need now - the only reason I would have for
this
is to make the parser pass a few more anal compliance tests. I've
attached
the test case - except for the tests that require access to the prolog,
I
think pyRXP passes everything that RXP does which is good (although I
can't
tell for sure until I can run the entire test suite without causing a
core
dump :) ). I can think of valid use cases where you need this
information
though, so it would be a nifty feature. From a look at the code, I can't
quite figure out why it isn't already including the prolog :-/
--Apple-Mail-2-782266559
Content-Disposition: attachment;
filename=test_xmltestsuite.py
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
x-unix-mode=0644;
name="test_xmltestsuite.py"
#!/usr/bin/env python
'''
$Id: test_xmltestsuite.py,v 1.1 2003/02/08 16:35:39 zen Exp $
Test parsing and validation against James Clark's test cases,
as downloaded from http://www.jclark.com/xml/
The .zip file should be in the same directory as this script.
Note that the .zip file can be freely distributed in unmodified form
so it could be added to the pyRXP distribution.
'''
# RCS-style $Id$ keyword string identifying this revision of the script
__rcs_id__ = '$Id: test_xmltestsuite.py,v 1.1 2003/02/08 16:35:39 zen Exp $'
# Bare revision number: the slice drops the leading '$Revision: ' (11 chars)
# and the trailing ' $'
__version__ = '$Revision: 1.1 $'[11:-2]
__author__ = 'Stuart Bishop <stuart@stuartbishop.net>'
# Set to a true value to trace every parse on stderr and pause for a second
# after each file (see test_pyRXPU.parse)
debug = 0
import unittest
import zipfile
import sys
import os
import os.path
import pyRXPU
import codecs
# The time module is only needed for the pause used while tracing memory bugs
if debug:
    import time

# Python 2.2 compatibility (sort of): a script run directly has no __file__
# there, so fall back to a placeholder path inside the current directory.
try:
    __file__
except NameError:
    __file__ = os.path.join(os.getcwd(), 'oops')
class test_pyRXPU(unittest.TestCase):
    '''
    Run the files of James Clark's XML test suite through the parser module.

    No test_* methods are written out here: buildup_test() attaches one per
    test file, each delegating to one of the _test_* helpers below.
    '''

    # Parser module under test; the helpers only ever reach it via self.mod
    mod = pyRXPU

    # Characters escaped in canonical XML output and their replacements.
    # '&' has to come first so the ampersands introduced by the later
    # replacements are not escaped a second time.
    _escapes = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ('\x09', '&#9;'),
        ('\x0a', '&#10;'),
        ('\x0d', '&#13;'),
        )

    def parse(self,filename,**kw):
        '''
        Parse the named file and return whatever the parser returns.

        Extra keyword arguments are handed to the Parser constructor.
        ReturnComments, ExpandEmpty and ReturnProcessingInstructions are
        always switched on, overriding any value the caller supplied.
        Errors raised by the parser or by opening the file propagate.
        '''
        if debug: sys.stderr.write('About to parse %s\n' % filename)
        kw = kw.copy()
        kw['ReturnComments'] = 1
        kw['ExpandEmpty'] = 1
        kw['ReturnProcessingInstructions'] = 1
        parser = self.mod.Parser(**kw)

        # Change directory in case we are loading entities from cwd
        retdir = os.getcwd()
        d,n = os.path.split(filename)
        try:
            # A bare file name has no directory part; chdir('') would fail
            if d:
                os.chdir(d)
            f = open(n)
            try:
                xml = f.read()
            finally:
                f.close()
            return parser.parse(xml)
        finally:
            # Always restore the working directory, even on a parse error
            os.chdir(retdir)
            if debug: sys.stderr.write('Done parsing %s\n' % filename)
            if debug: sys.stderr.write('='*60 + '\n')
            if debug: time.sleep(1)

    def getcanonical(self,filename):
        ''' Parse in the named file, and return it as canonical XML '''
        return self._getcan(self.parse(filename))

    def _getcan(self,node):
        '''
        Return the canonical XML text for one parsed node, recursively.

        A node is either a string (character data) or a 4-tuple of
        (tag, attributes, children, extra); the last item is ignored.
        '''
        if type(node) in (type(''),type(u'')):
            return self._quote(node)

        tag,attrs,kids,junk = node
        if tag == self.mod.commentTagName:
            return u'<!--%s-->' % (kids[0])
        elif tag == self.mod.piTagName:
            return u'<?%s %s?>' % (attrs['name'],kids[0])

        if attrs is None:
            attrs = ''
        else:
            keys = list(attrs.keys())
            keys.sort() # Attributes in lexical order
            attrs = ' '.join(
                ['%s="%s"' % (k,self._quote(attrs[k])) for k in keys]
                )
            if attrs:
                attrs = ' ' + attrs
        text = ''.join([self._getcan(kid) for kid in kids])
        return '<%s%s>%s</%s>' % (tag,attrs,text,tag)

    def _quote(self,txt):
        '''
        Escape character data or an attribute value for canonical XML.

        The markup characters & < > " become entity references, and tab,
        line feed and carriage return become numeric character references.
        '''
        for raw,escaped in self._escapes:
            txt = txt.replace(raw,escaped)
        return txt

    def _test_valid(self,inname,outname):
        ''' Check that inname canonicalises to the UTF-8 text in outname '''
        inxml = self.getcanonical(inname)
        f = codecs.open(outname,mode='r',encoding='utf8')
        try:
            outxml = f.read()
        finally:
            f.close()
        self.assertEqual(inxml,outxml)

    def _test_invalid_parse(self,inname):
        ''' An invalid document must still parse when not validating '''
        try:
            self.parse(inname,Validate=0)
        except self.mod.error:
            self.fail('Failed to parse %r in non-validating mode' % inname)

    def _test_invalid_validate(self,inname):
        ''' An invalid document must be rejected when validating '''
        try:
            self.parse(inname,Validate=1)
            # fail() raises the test failure exception, which the handler
            # below does not catch, so it propagates to the test runner
            self.fail('Failed to detect validity error in %r' % inname)
        except self.mod.error:
            pass

    def _test_notwf(self,inname):
        ''' A document that is not well formed must be rejected '''
        try:
            self.parse(inname,Validate=0)
            self.fail(
                'Failed to detect that %r was not well formed' % inname
                )
        except self.mod.error:
            pass
def buildup_test(cls=test_pyRXPU):
    '''
    Add test methods to the TestCase

    Unpacks xmltest.zip, which must sit in the same directory as this
    script, alongside the script (files already on disk are left alone).
    The .xml input files are sorted into cls.valid, cls.invalid and
    cls.notwf by the directory names in their archive path, and one
    test_* method per input file is attached to cls.
    '''
    cls.valid = []
    cls.invalid = []
    cls.notwf = []

    testdir = os.path.dirname(__file__)
    zipf = zipfile.ZipFile(os.path.join(testdir,'xmltest.zip'))
    try:
        for zipname in zipf.namelist():
            # Extract the files if they don't already exist
            osname = os.path.join(*zipname.split('/')) # For non-unixes
            osname = os.path.join(testdir,osname)
            dirname = os.path.dirname(osname)
            if not os.path.isdir(dirname):
                os.makedirs(dirname)
            # An archive name ending in '/' is a directory entry: creating
            # the directory above is all it needs, there is no file to write
            if zipname.endswith('/'):
                continue
            if not os.path.isfile(osname):
                f = open(osname,'wb')
                try:
                    f.write(zipf.read(zipname))
                finally:
                    f.close()

            # Add input files to our lists. Expected-output files live in
            # an 'out' directory and are paired with their input below.
            if os.path.splitext(osname)[1] == '.xml' and zipname.find('out') == -1:
                # 'invalid' must be tested before 'valid', which it contains
                if zipname.find('invalid') != -1:
                    cls.invalid.append(osname)
                elif zipname.find('not-wf') != -1:
                    cls.notwf.append(osname)
                elif zipname.find('valid') != -1:
                    outname = os.path.join(dirname,'out',os.path.basename(osname))
                    cls.valid.append( (osname,outname) )
    finally:
        zipf.close()

    # Each doTest binds its file names as default arguments so that it keeps
    # the values of its own loop iteration rather than the final ones.

    # Add 'valid' tests
    for inname,outname in cls.valid:
        num = int(os.path.splitext(os.path.basename(inname))[0])
        group = os.path.split(os.path.split(inname)[0])[1]
        mname = 'test_Valid_%s_%03d' % (group,num)
        def doTest(self,inname=inname,outname=outname):
            self._test_valid(inname,outname)
        setattr(cls,mname,doTest)

    # Add 'invalid' tests: two per file, one for each parsing mode
    for inname in cls.invalid:
        num = int(os.path.splitext(os.path.basename(inname))[0])
        mname = 'test_InvalidParse_%03d' % (num)
        def doTest(self,inname=inname):
            self._test_invalid_parse(inname)
        setattr(cls,mname,doTest)
        mname = 'test_InvalidValidate_%03d' % (num)
        def doTest(self,inname=inname):
            self._test_invalid_validate(inname)
        setattr(cls,mname,doTest)

    # Add 'not wellformed' tests
    for inname in cls.notwf:
        num = int(os.path.splitext(os.path.basename(inname))[0])
        group = os.path.split(os.path.split(inname)[0])[1]
        mname = 'test_NotWellFormed_%s_%03d' % (group,num)
        def doTest(self,inname=inname):
            self._test_notwf(inname)
        setattr(cls,mname,doTest)
# Attach the generated test methods at import time, before any test runner
# inspects the TestCase class.
buildup_test()

if __name__ == '__main__':
    unittest.main()
--Apple-Mail-2-782266559
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
format=flowed
--
Stuart Bishop <zen@shangri-la.dropbear.id.au>
http://shangri-la.dropbear.id.au/
--Apple-Mail-2-782266559--