[reportlab-users] new pyRXP

Stuart Bishop reportlab-users@reportlab.com
Thu, 10 Apr 2003 16:33:46 +1000


--Apple-Mail-2-782266559
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed


On Wednesday, April 9, 2003, at 05:25  PM, Robin Becker wrote:

> In article <6CDB856C-6A39-11D7-9FD0-000393B63DDC@shangri-
> la.dropbear.id.au>, Stuart Bishop <zen@shangri-la.dropbear.id.au> 
> writes
> .....
>> There is still a double free around, and I can trigger it with my
>> test suite. It seems to be in the parser cleanup routines. I'll have
>> a closer look at this when I get a chance, but anyone more familiar
>> with malloc debugging and the time is welcome to do it for me :-)
>> I've attached the test cases that can trigger the issue in case anyone
>> is feeling enthusiastic. This particular problem should only bite 
>> people
>> who are parsing XML from untrusted sources since it takes particularly
>> broken XML to trigger the malloc errors as far as I can tell.
> .... if I get some more time I'll have a go at this, especially since 
> we
> now have a later rxp source. Thanks for the testing, are you now using
> tests based on the later comment/processing instruction handling? I saw

Forgot about the new comment/pi handling - which I need (not for the
comments or pi's themselves, but to make sure that I handle CDATA 
sections
properly - before I couldn't tell the difference between <!-- foo --> and
<![CDATA[<!-- foo-->]]> ). Yup - seems to be running well :-)

> also that you were attempting to get at the prolog in some way, is that
> still required? It seems quite deeply buried in rxp.

pyRXPU does everything I need now - the only reason I would have for 
this
is to make the parser pass a few more anal compliance tests. I've 
attached
the test case - except for the tests that require access to the prolog, 
I
think pyRXP passes everything that RXP does which is good (although I 
can't
tell for sure until I can run the entire test suite without causing a 
core
dump :) ). I can think of valid use cases where you need this 
information
though, so it would be a nifty feature. From a look at the code, I can't
quite figure out why it isn't already including the prolog :-/



--Apple-Mail-2-782266559
Content-Disposition: attachment;
	filename=test_xmltestsuite.py
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0644;
	name="test_xmltestsuite.py"

#!/usr/bin/env python
'''
$Id: test_xmltestsuite.py,v 1.1 2003/02/08 16:35:39 zen Exp $
Test parsing and validation against James Clark's test cases,
as downloaded from http://www.jclark.com/xml/
The .zip file should be in the same directory as this script.
Note that the .zip file can be freely distributed in unmodified form
so it could be added to the pyRXP distribution.
'''

__rcs_id__  = '$Id: test_xmltestsuite.py,v 1.1 2003/02/08 16:35:39 zen Exp $'
__version__ = '$Revision: 1.1 $'[11:-2]
__author__ = 'Stuart Bishop <stuart@stuartbishop.net>'

# Set to a true value to have parse() report on stderr before and after
# each file it parses, and to pause for a second after each one.
debug = 0

import unittest
import zipfile
import sys
import os
import os.path
import pyRXPU
import codecs

# Debug is to help me trace down memory bugs
# (time is only used for the pause after each parse when debug is on,
# which is why it is imported conditionally)
if debug: import time

# 2.2 compatibility - sort of
# If the name __file__ is not defined for this module, referencing it
# raises NameError; fall back to a placeholder path inside the current
# directory so that os.path.dirname(__file__) still yields a directory
# in which to look for the test archive.
try:
    __file__
except NameError:
    __file__ = os.path.join(os.getcwd(),'oops')

class test_pyRXPU(unittest.TestCase):
    ''' Helpers for running XML test files through the parser module.

    No test_* methods are written out in this class body; they are
    attached afterwards by buildup_test(), one per input file, and each
    simply calls one of the _test_* helpers below.

    The parser module under test is taken from the 'mod' class attribute.
    '''
    mod = pyRXPU

    def parse(self,filename,**kw):
        ''' Parse the named file and return whatever the parser returns.

        Keyword arguments are passed on to self.mod.Parser, except that
        ReturnComments, ExpandEmpty and ReturnProcessingInstructions are
        always forced to 1.

        The file is parsed with its own directory as the current
        directory; the previous working directory is restored afterwards,
        whether or not parsing succeeds.
        '''
        if debug: sys.stderr.write('About to parse %s\n' % filename)
        kw = kw.copy()
        # Forced on, overriding anything the caller passed in
        kw['ReturnComments'] = 1
        kw['ExpandEmpty'] = 1
        kw['ReturnProcessingInstructions'] = 1
        parser = self.mod.Parser(**kw)
        # Change directory in case we are loading entities from cwd
        retdir = os.getcwd()
        d,n = os.path.split(filename)
        try:
            # A bare file name has no directory part, and chdir('') fails
            if d:
                os.chdir(d)
            # Binary mode so the parser sees the file's bytes untouched;
            # text mode translates line endings on some platforms
            f = open(n,'rb')
            try:
                xml = f.read()
            finally:
                f.close()
            return parser.parse(xml)
        finally:
            os.chdir(retdir)
            if debug: sys.stderr.write('Done parsing   %s\n' % filename)
            if debug: sys.stderr.write('='*60 + '\n')
            if debug: time.sleep(1)

    def getcanonical(self,filename):
        ''' Parse in the named file, and return it as canonical XML '''
        return self._getcan(self.parse(filename))

    def _getcan(self,node):
        ''' Serialise one node of a parsed tree, recursing into children.

        A node is either a string (returned quoted) or a 4-item sequence
        of (tag, attrs, kids, junk). The fourth item is ignored.
        '''
        if type(node) in (type(''),type(u'')):
            return self._quote(node)

        tag,attrs,kids,junk = node

        # Comments and processing instructions are reported under special
        # tag names and are written back out unquoted
        if tag == self.mod.commentTagName:
            return u'<!--%s-->' % (kids[0],)
        elif tag == self.mod.piTagName:
            return u'<?%s %s?>' % (attrs['name'],kids[0])

        if attrs is None:
            attrs = ''
        else:
            keys = list(attrs.keys())
            keys.sort() # Attributes in lexical order
            attrs = ' '.join(
                ['%s="%s"' % (k,self._quote(attrs[k])) for k in keys]
                )
            if attrs:
                attrs = ' ' + attrs

        # Treat a missing child list like an empty one, mirroring the
        # None check made for attrs above
        text = ''.join([self._getcan(kid) for kid in (kids or ())])

        return '<%s%s>%s</%s>' % (tag,attrs,text,tag)

    def _quote(self,txt):
        ''' Escape &, <, >, ", tab, linefeed and carriage return in txt '''
        # & must go first so the entities added below are not re-escaped
        txt = txt.replace('&','&amp;')
        txt = txt.replace('<','&lt;')
        txt = txt.replace('>','&gt;')
        txt = txt.replace('"','&quot;')
        txt = txt.replace('\x09','&#9;')
        txt = txt.replace('\x0a','&#10;')
        txt = txt.replace('\x0d','&#13;')
        return txt

    def _test_valid(self,inname,outname):
        ''' Check that inname, parsed and serialised by getcanonical(),
            equals the UTF-8 text stored in outname.
        '''
        inxml = self.getcanonical(inname)
        f = codecs.open(outname,mode='r',encoding='utf8')
        try:
            outxml = f.read()
        finally:
            f.close()
        self.assertEqual(inxml,outxml)

    def _test_invalid_parse(self,inname):
        ''' Check that inname parses without error when Validate=0 '''
        try:
            self.parse(inname,Validate=0)
        except self.mod.error:
            self.fail('Failed to parse %r in non-validating mode' % inname)

    def _test_invalid_validate(self,inname):
        ''' Check that parsing inname with Validate=1 raises mod.error '''
        try:
            self.parse(inname,Validate=1)
        except self.mod.error:
            pass
        else:
            self.fail('Failed to detect validity error in %r' % inname)

    def _test_notwf(self,inname):
        ''' Check that parsing inname with Validate=0 raises mod.error '''
        try:
            self.parse(inname,Validate=0)
        except self.mod.error:
            pass
        else:
            self.fail(
                'Failed to detect that %r was not well formed' % inname
                )

def buildup_test(cls=test_pyRXPU):
    ''' Add test methods to the TestCase

    Opens xmltest.zip from the directory containing this script, extracts
    any member that is not already on disk, and sorts the input .xml
    files into cls.valid (as (input, expected-output) pairs), cls.invalid
    and cls.notwf according to their path inside the archive. One or two
    test methods per input file are then attached to cls; each calls the
    matching cls._test_* helper.
    '''
    cls.valid = []
    cls.invalid = []
    cls.notwf = []
    # Absolute, so the recorded paths do not depend on the working
    # directory still being the same when the tests eventually run
    testdir = os.path.dirname(os.path.abspath(__file__))
    zipf = zipfile.ZipFile(os.path.join(testdir,'xmltest.zip'))
    try:
        for zipname in zipf.namelist():

            # Extract the files if they don't already exist
            parts = [p for p in zipname.split('/') if p] # For non-unixes
            osname = os.path.join(testdir,*parts)

            # An archive member whose name ends in '/' is a directory,
            # not a file: create it, but don't try to write it out
            is_dir_entry = zipname.endswith('/')
            if is_dir_entry:
                subdir = osname
            else:
                subdir = os.path.dirname(osname)
            if not os.path.isdir(subdir):
                os.makedirs(subdir)
            if is_dir_entry:
                continue

            if not os.path.isfile(osname):
                f = open(osname,'wb')
                try:
                    f.write(zipf.read(zipname))
                finally:
                    f.close()

            # Add input files to our lists
            # ('invalid' has to be tested before 'valid', which it contains)
            if (os.path.splitext(osname)[1] == '.xml'
                    and zipname.find('out') == -1):
                if zipname.find('invalid') != -1:
                    cls.invalid.append(osname)
                elif zipname.find('not-wf') != -1:
                    cls.notwf.append(osname)
                elif zipname.find('valid') != -1:
                    outname = os.path.join(
                        subdir,'out',os.path.basename(osname)
                        )
                    cls.valid.append( (osname,outname) )
    finally:
        zipf.close()

    # In the loops below the file names are bound as default arguments so
    # each generated method keeps its own values rather than the last
    # ones seen by the loop.

    # Add 'valid' tests
    for inname,outname in cls.valid:
        num = int(os.path.splitext(os.path.basename(inname))[0])
        group = os.path.split(os.path.split(inname)[0])[1]
        mname = 'test_Valid_%s_%03d' % (group,num)
        def doTest(self,inname=inname,outname=outname):
            self._test_valid(inname,outname)
        setattr(cls,mname,doTest)

    # Add 'invalid' tests
    # NOTE: unlike the other two groups, the directory name is not part
    # of these method names, so two invalid inputs with the same number
    # in different directories would replace one another.
    for inname in cls.invalid:
        num = int(os.path.splitext(os.path.basename(inname))[0])
        mname = 'test_InvalidParse_%03d' % (num)
        def doTest(self,inname=inname):
            self._test_invalid_parse(inname)
        setattr(cls,mname,doTest)
        mname = 'test_InvalidValidate_%03d' % (num)
        def doTest(self,inname=inname):
            self._test_invalid_validate(inname)
        setattr(cls,mname,doTest)

    # Add 'not wellformed' tests
    for inname in cls.notwf:
        num = int(os.path.splitext(os.path.basename(inname))[0])
        group = os.path.split(os.path.split(inname)[0])[1]
        mname = 'test_NotWellFormed_%s_%03d' % (group,num)
        def doTest(self,inname=inname):
            self._test_notwf(inname)
        setattr(cls,mname,doTest)
        
# Attach the generated test methods as soon as the module is loaded, so
# they are already on the class when the tests are collected.
buildup_test()

# Run the tests when this file is executed as a script
if __name__ == '__main__':
    unittest.main()


--Apple-Mail-2-782266559
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed



-- 
Stuart Bishop <zen@shangri-la.dropbear.id.au>
http://shangri-la.dropbear.id.au/


--Apple-Mail-2-782266559--