[reportlab-users] re: Unicode pyRXP and malloc problems in pyRXP
Stuart Bishop
reportlab-users@reportlab.com
Sat, 15 Feb 2003 13:31:32 +1100
--Apple-Mail-4-397099313
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
format=flowed
On Saturday, February 15, 2003, at 12:17 AM, Robin Becker wrote:
> Stuart Bishop, if you have working code for Mac uRXP I will try and
> test
> on the PC to see if your malloc issues are OS related.
It would be worth testing the bog standard pyRXP 0.9 with the example
file I posted first - it is affected as well.
> Context diff patch would be fine.
Diff attached, along with the brand-new files, since cvs diff doesn't
want to mention them...
test_xmltestsuite.py depends on xmltest.zip, which can be downloaded
from ftp://ftp.jclark.com/pub/xml/xmltest.zip
> The pyRXP module assumes everything is 8 bit, so it's likely that
> anything malloced by it will have wrong sizes when asked to do 16 bit
> things.
I think I've got these points all sorted.
--Apple-Mail-4-397099313
Content-Disposition: attachment;
filename=uRXP.diff
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
x-unix-mode=0664;
name="uRXP.diff"
? .gdb_history
? _uRXP
? build
? oldRXP.c
? pyRXP.c.new
? setup.pyc
? examples/.gdb_history
? examples/expattree.pyc
? examples/hamlet.py
? rxp/rxp
? test/.gdb_history
? test/test_uRXP.py
? test/test_xmltestsuite.py
? test/test_xmltestsuite.py.org
? test/xmltest
? test/xmltest.zip
Index: pyRXP.c
===================================================================
RCS file: /cvsroot/reportlab/rl_addons/pyRXP/pyRXP.c,v
retrieving revision 1.10
diff -c -c -r1.10 pyRXP.c
*** pyRXP.c 25 Oct 2002 15:27:40 -0000 1.10
--- pyRXP.c 15 Feb 2003 00:41:05 -0000
***************
*** 11,17 ****
#include <stdarg.h>
#ifndef CHAR_SIZE
! #define CHAR_SIZE 8
#endif
#include "system.h"
--- 11,26 ----
#include <stdarg.h>
#ifndef CHAR_SIZE
! #error CHAR_SIZE not specified
! #endif
!
! #if CHAR_SIZE == 16
! #define MODULE "uRXP"
! #elif CHAR_SIZE == 8
! #define PYSTRING(s) PyString_FromString(s)
! #define MODULE "pyRXP"
! #else
! #error Invalid CHAR_SIZE specified
#endif
#include "system.h"
***************
*** 24,32 ****
#include "stdio16.h"
#include "version.h"
#include "namespaces.h"
! #define VERSION "0.9"
! #define MODULE "pyRXP"
#define MAX_DEPTH 256
static PyObject *moduleError;
static PyObject *moduleVersion;
static PyObject *RXPVersion;
--- 33,51 ----
#include "stdio16.h"
#include "version.h"
#include "namespaces.h"
! #define VERSION "0.9.1"
#define MAX_DEPTH 256
+
+ #if CHAR_SIZE == 16
+ PyObject* PYSTRING(const Char* s) {
+ PyObject* rV;
+ int len = 0;
+ len = (int) Strlen( s );
+ rV = PyUnicode_Decode( (const char*) s, len*2, "utf16", NULL);
+ return rV;
+ }
+ #endif
+
static PyObject *moduleError;
static PyObject *moduleVersion;
static PyObject *RXPVersion;
***************
*** 115,125 ****
looks the same as the entity reference.\n\
ReturnComments = 0\n\
If this is set, comments are returned, otherwise they are ignored.\n\
CaseInsensitive = 0\n\
ErrorOnUndefinedElements = 0\n\
ErrorOnUndefinedAttributes = 0\n\
! If these are set and there is a DTD, references to undeclared elements\n\
! and attributes are an error.\n\
WarnOnRedefinitions = 0\n\
If this is on, a warning is given for redeclared elements, attributes,\n\
entities and notations.\n"
--- 134,147 ----
looks the same as the entity reference.\n\
ReturnComments = 0\n\
If this is set, comments are returned, otherwise they are ignored.\n\
+ ReturnProcessingInstructions = 0\n\
+ If this is set, processing instructions are returned, otherwise\n\
+ they are ignored.\n\
CaseInsensitive = 0\n\
ErrorOnUndefinedElements = 0\n\
ErrorOnUndefinedAttributes = 0\n\
! If these are set and there is a DTD, references to undeclared\n\
! elements and attributes are an error.\n\
WarnOnRedefinitions = 0\n\
If this is on, a warning is given for redeclared elements, attributes,\n\
entities and notations.\n"
***************
*** 189,194 ****
--- 211,217 ----
{"ErrorOnBadCharacterEntities",1},
{"ErrorOnUndefinedEntities",1},
{"ReturnComments",0},
+ {"ReturnProcessingInstructions",0},
{"CaseInsensitive",0},
{"ErrorOnUndefinedElements",0},
{"ErrorOnUndefinedAttributes",0},
***************
*** 250,260 ****
int useNone = pd->none_on_empty && !a;
if(!useNone){
! PyObject *attrs=PyDict_New(), *t;
for(; a; a=a->next){
! PyDict_SetItemString(attrs, (char*)a->definition->name,
t=PyString_FromString(a->value));
! Py_DECREF(t);
}
return attrs;
}
--- 273,289 ----
int useNone = pd->none_on_empty && !a;
if(!useNone){
! PyObject *attrs=PyDict_New(), *t1,*t2;
for(; a; a=a->next){
! /*PyDict_SetItemString(attrs, (char*)a->definition->name,
t=PyString_FromString(a->value));
! Py_DECREF(t);*/
! PyDict_SetItem(attrs,
! t1=PYSTRING( (Char*)a->definition->name ),
! t2=PYSTRING( (Char*)a->value )
! );
! Py_DECREF(t1);
! Py_DECREF(t2);
}
return attrs;
}
***************
*** 264,273 ****
}
}
! static PyObject* makeNode(ParserDetails* pd, char *name, PyObject* attr, int empty)
{
PyObject *t = PDNode_New(4);
! PDSetItem(t, 0, PyString_FromString(name));
PDSetItem(t, 1, attr);
if(empty && pd->none_on_empty){
attr = Py_None;
--- 293,302 ----
}
}
! static PyObject* makeNode(ParserDetails* pd, const Char *name, PyObject* attr, int empty)
{
PyObject *t = PDNode_New(4);
! PDSetItem(t, 0, PYSTRING(name));
PDSetItem(t, 1, attr);
if(empty && pd->none_on_empty){
attr = Py_None;
***************
*** 285,290 ****
--- 314,323 ----
return t;
}
+ #if CHAR_SIZE == 16
+ Char* com_head;
+ Char* com_tail;
+ #endif
static int handle_bit(Parser p, XBit bit, PyObject *stack[],int *depth)
{
***************
*** 306,313 ****
}
empty = bit->type == XBIT_empty;
! t = makeNode( pd, (char*)bit->element_definition->name,
! get_attrs(pd, bit->element_definition, bit->attributes), empty);
if(empty){
PyList_Append(PDGetItem(stack[*depth],2),t);
Py_DECREF(t);
--- 339,346 ----
}
empty = bit->type == XBIT_empty;
! t = makeNode( pd, bit->element_definition->name,
! get_attrs(pd, bit->element_definition, bit->attributes), empty);
if(empty){
PyList_Append(PDGetItem(stack[*depth],2),t);
Py_DECREF(t);
***************
*** 329,346 ****
Py_DECREF(t);
break;
case XBIT_pi:
! #if 0
! bit->pi_name;
! bit->pi_chars;
! #endif
break;
case XBIT_pcdata:
! t = PyString_FromString(bit->pcdata_chars);
PyList_Append(PDGetItem(stack[*depth],2),t);
Py_DECREF(t);
break;
case XBIT_cdsect:
! t = PyString_FromString(bit->cdsect_chars);
PyList_Append(PDGetItem(stack[*depth],2),t);
Py_DECREF(t);
break;
--- 362,395 ----
Py_DECREF(t);
break;
case XBIT_pi:
! if(ParserGetFlag(p,ReturnProcessingInstructions)){
! Char* c = (Char*)PyMem_Malloc(
! (Strlen(bit->pi_name) + Strlen(bit->pi_chars) + 6)*2
! );
! Char* z = strdup_char8_to_Char("<?");
! Strcpy(c,z);
! free(z);
! Strcat(c,bit->pi_name);
! z = strdup_char8_to_Char(" ");
! Strcat(c,z);
! free(z);
! Strcat(c,bit->pi_chars);
! z = strdup_char8_to_Char("?>");
! Strcat(c,z);
! free(z);
! t = PYSTRING(c);
! PyList_Append(PDGetItem(stack[*depth],2),t);
! Py_DECREF(t);
! PyMem_Free(c);
! }
break;
case XBIT_pcdata:
! t = PYSTRING(bit->pcdata_chars);
PyList_Append(PDGetItem(stack[*depth],2),t);
Py_DECREF(t);
break;
case XBIT_cdsect:
! t = PYSTRING(bit->cdsect_chars);
PyList_Append(PDGetItem(stack[*depth],2),t);
Py_DECREF(t);
break;
***************
*** 348,362 ****
break;
case XBIT_comment:
if(ParserGetFlag(p,ReturnComments)){
char* c = (char*)PyMem_Malloc(strlen(bit->comment_chars)+8);
strcpy(c,"<!--");
strcat(c,bit->comment_chars);
strcat(c,"-->");
! t = PyString_FromString(c);
PyList_Append(PDGetItem(stack[*depth],2),t);
Py_DECREF(t);
PyMem_Free(c);
}
break;
default:
Fprintf(Stderr, "\nUnknown event type %s\n", XBitTypeName[bit->type]);
--- 397,420 ----
break;
case XBIT_comment:
if(ParserGetFlag(p,ReturnComments)){
+ #if CHAR_SIZE == 8
char* c = (char*)PyMem_Malloc(strlen(bit->comment_chars)+8);
strcpy(c,"<!--");
strcat(c,bit->comment_chars);
strcat(c,"-->");
! #elif CHAR_SIZE == 16
! Char* c = (Char*)PyMem_Malloc(Strlen(bit->comment_chars)*2+16);
! Strcpy(c,com_head);
! Strcat(c,bit->comment_chars);
! Strcat(c,com_tail);
! #endif
! t = PYSTRING(c);
PyList_Append(PDGetItem(stack[*depth],2),t);
Py_DECREF(t);
PyMem_Free(c);
+
}
+
break;
default:
Fprintf(Stderr, "\nUnknown event type %s\n", XBitTypeName[bit->type]);
***************
*** 376,382 ****
if(e->type==ET_external){
PyObject *arglist;
PyObject *result;
! arglist = Py_BuildValue("(s)",e->systemid);
result = PyEval_CallObject(eoCB, arglist);
if(result){
if(PyString_Check(result)){
--- 434,440 ----
if(e->type==ET_external){
PyObject *arglist;
PyObject *result;
! arglist = Py_BuildValue("(s)",e->systemid);/* NB. 8 bit */
result = PyEval_CallObject(eoCB, arglist);
if(result){
if(PyString_Check(result)){
***************
*** 399,413 ****
--- 457,484 ----
}
void PyErr_FromStderr(Parser p, char *msg){
+ /* Yech. This appears to be pulling the error messages
+ from the internals of RXP's Stderr. */
struct _FILE16 {
void *handle;
int handle2, handle3;
};
+ #if CHAR_SIZE == 8
char *buf=((struct _FILE16*)Stderr)->handle;
if(p->errbuf) Fprintf(Stderr,"%s\n", p->errbuf);
Fprintf(Stderr,"%s\n", msg);
buf[((struct _FILE16*)Stderr)->handle2] = 0;
PyErr_SetString(moduleError,buf);
+ #else
+ Char *buf=((struct _FILE16*)Stderr)->handle;
+ if(p->errbuf) Fprintf(Stderr,"%s\n", p->errbuf);
+ Fprintf(Stderr,"%s\n", msg);
+ buf[((struct _FILE16*)Stderr)->handle2 / 2] = 0;
+ buf[(((struct _FILE16*)Stderr)->handle2 / 2) + 1] = 0;
+ PyObject* t = PYSTRING(buf);
+ PyErr_SetObject(moduleError,t);
+ Py_DECREF(t);
+ #endif
}
/*return non zero for error*/
***************
*** 425,431 ****
}
depth = 0;
! stack[0] = makeNode( pd, "", Py_None, 0); /*stealing a reference to Py_None*/
Py_INCREF(Py_None); /*so we must correct for it*/
while(1){
XBitType bt;
--- 496,502 ----
}
depth = 0;
! stack[0] = makeNode( pd,(const Char*)"", Py_None, 0); /*stealing a reference to Py_None*/
Py_INCREF(Py_None); /*so we must correct for it*/
while(1){
XBitType bt;
***************
*** 487,492 ****
--- 558,564 ----
str = MakeFILE16FromString(buf,sizeof(buf)-1,"w");
_ParserPerror(str, pd->p, bit);
Fclose(str);
+ /* TODO: This probably needs to be unicode as well */
arglist = Py_BuildValue("(s)",buf);
result = PyEval_CallObject(pd->warnCB, arglist);
Py_DECREF(arglist);
***************
*** 675,681 ****
int i;
if(!strcmp(name,"warnCB")) return _get_OB(name,self->warnCB);
else if(!strcmp(name,"eoCB")) return _get_OB(name,self->eoCB);
! else if(!strcmp(name,"fourth")) return _get_OB(name,self->eoCB);
else if(!strcmp(name,"srcName")){
Py_INCREF(self->srcName);
return self->srcName;
--- 747,753 ----
int i;
if(!strcmp(name,"warnCB")) return _get_OB(name,self->warnCB);
else if(!strcmp(name,"eoCB")) return _get_OB(name,self->eoCB);
! else if(!strcmp(name,"fourth")) return _get_OB(name,self->fourth);
else if(!strcmp(name,"srcName")){
Py_INCREF(self->srcName);
return self->srcName;
***************
*** 755,765 ****
}
static struct PyMethodDef moduleMethods[] = {
! {"Parser", (PyCFunction)pyRXPParser, METH_VARARGS|METH_KEYWORDS, "Parser(*kw) create a pyRXP parser instance"},
{NULL, NULL} /*sentinel*/
};
DL_EXPORT(void) initpyRXP(void)
{
PyObject *m, *d, *v, *t;
int i;
--- 827,844 ----
}
static struct PyMethodDef moduleMethods[] = {
! {"Parser", (PyCFunction)pyRXPParser,
! METH_VARARGS|METH_KEYWORDS,
! "Parser(*kw) create a pyRXP parser instance"
! },
{NULL, NULL} /*sentinel*/
};
+ #if CHAR_SIZE == 16
+ DL_EXPORT(void) inituRXP(void)
+ #elif CHAR_SIZE == 8
DL_EXPORT(void) initpyRXP(void)
+ #endif
{
PyObject *m, *d, *v, *t;
int i;
***************
*** 777,786 ****
RXPVersion = PyString_FromString(rxp_version_string);
PyDict_SetItemString(d, "RXPVersion", RXPVersion );
moduleError = PyErr_NewException(MODULE ".Error",NULL,NULL);
! PyDict_SetItemString(d,"error",moduleError);
parser_flags = PyDict_New();
for(i=0;flag_vals[i].k;i++){
! PyDict_SetItemString(parser_flags, flag_vals[i].k, t=PyInt_FromLong(flag_vals[i].v));
Py_DECREF(t);
}
PyDict_SetItemString(d,"parser_flags",parser_flags);
--- 856,866 ----
RXPVersion = PyString_FromString(rxp_version_string);
PyDict_SetItemString(d, "RXPVersion", RXPVersion );
moduleError = PyErr_NewException(MODULE ".Error",NULL,NULL);
! PyDict_SetItemString(d,"error",moduleError);
parser_flags = PyDict_New();
for(i=0;flag_vals[i].k;i++){
! PyDict_SetItemString(parser_flags, flag_vals[i].k,
! t=PyInt_FromLong(flag_vals[i].v));
Py_DECREF(t);
}
PyDict_SetItemString(d,"parser_flags",parser_flags);
***************
*** 789,792 ****
--- 869,880 ----
v = PyString_FromString(moduleDoc);
PyDict_SetItemString(d, "__doc__", v);
Py_DECREF(v);
+
+ #if CHAR_SIZE == 16
+ com_head = (Char*)PyMem_Malloc(10);
+ com_tail = (Char*)PyMem_Malloc(8);
+ char8_to_Char("<!--",com_head);
+ char8_to_Char("-->",com_tail);
+ #endif
+
}
Index: setup.py
===================================================================
RCS file: /cvsroot/reportlab/rl_addons/pyRXP/setup.py,v
retrieving revision 1.6
diff -c -c -r1.6 setup.py
*** setup.py 3 May 2002 10:20:22 -0000 1.6
--- setup.py 15 Feb 2003 00:41:05 -0000
***************
*** 4,11 ****
--- 4,18 ----
#history http://cvs.sourceforge.net/cgi-bin/cvsweb.cgi/rl_addons/pyRXP/setup.py?cvsroot=reportlab
#$Header: /cvsroot/reportlab/rl_addons/pyRXP/setup.py,v 1.6 2002/05/03 10:20:22 rgbecker Exp $
if __name__=='__main__': #NO RUNTESTS
+
import os, sys
+ import shutil
from distutils.core import setup, Extension
+
+ # patch distutils if it can't cope with the "classifiers" keyword
+ if sys.version < '2.2.3':
+ from distutils.dist import DistributionMetadata
+ DistributionMetadata.classifiers = None
def raiseConfigError(msg):
import exceptions
***************
*** 13,23 ****
pass
raise ConfigError(msg)
RXPDIR='rxp'
RXPLIBSOURCES=[]
! for f in ('xmlparser.c', 'url.c', 'charset.c', 'string16.c', 'ctype16.c', 'dtd.c',
! 'input.c', 'stdio16.c', 'system.c', 'hash.c', 'version.c', 'namespaces.c', 'http.c'):
! RXPLIBSOURCES.append(os.path.join(RXPDIR,f))
if sys.platform=="win32":
LIBS=['wsock32']
--- 20,46 ----
pass
raise ConfigError(msg)
+ # We copy the rxp source - we need to build it a second time for uRXP
+ # with different compile time flags
+ if os.path.exists('_uRXP'):
+ shutil.rmtree('_uRXP')
+ os.makedirs('_uRXP')
+
RXPDIR='rxp'
+ uRXPDIR='_uRXP'
RXPLIBSOURCES=[]
! uRXPLIBSOURCES=[]
! for f in ('xmlparser.c', 'url.c', 'charset.c', 'string16.c', 'ctype16.c',
! 'dtd.c', 'input.c', 'stdio16.c', 'system.c', 'hash.c',
! 'version.c', 'namespaces.c', 'http.c'):
! RXP_file = os.path.join(RXPDIR,f)
! uRXP_file = os.path.join(uRXPDIR,f)
! RXPLIBSOURCES.append(RXP_file)
! shutil.copy2(RXP_file,uRXP_file)
! uRXPLIBSOURCES.append(uRXP_file)
! uRXP_c = os.path.join(uRXPDIR,'uRXP.c')
! shutil.copy2('pyRXP.c',uRXP_c)
! uRXPLIBSOURCES.append(uRXP_c)
if sys.platform=="win32":
LIBS=['wsock32']
***************
*** 30,61 ****
else:
msg = "Don't know about system %s" % sys.platform
if int(os.environ.get('LIBERROR',1)):
! raiseConfigError(msg+'\nset environment LIBERROR=0 to try no extra libs')
else:
print msg
LIBS=[]
-
setup( name = "pyRXP",
! version = "0.5",
! description = "Python RXP interface",
author = "Robin Becker",
author_email = "robin@reportlab.com",
url = "http://www.reportlab.com",
packages = [],
ext_modules = [Extension( 'pyRXP',
! ['pyRXP.c']+RXPLIBSOURCES,
include_dirs=[RXPDIR],
! define_macros=[('CHAR_SIZE', 8)],
library_dirs=[],
-
# libraries to link against
libraries=LIBS,
),
! ]
)
! if sys.platform=='win32' and ('install' in sys.argv or 'install_ext' in sys.argv):
def MovePYDs(*F):
for x in sys.argv:
if x[:18]=='--install-platlib=': return
--- 53,112 ----
else:
msg = "Don't know about system %s" % sys.platform
if int(os.environ.get('LIBERROR',1)):
! raiseConfigError(
! msg+'\nset environment LIBERROR=0 to try no extra libs'
! )
else:
print msg
LIBS=[]
setup( name = "pyRXP",
! version = "0.9.1",
! description = "Python RXP interface - fast validating XML parser",
author = "Robin Becker",
author_email = "robin@reportlab.com",
url = "http://www.reportlab.com",
packages = [],
ext_modules = [Extension( 'pyRXP',
! ['oldRXP.c']+RXPLIBSOURCES,
include_dirs=[RXPDIR],
! define_macros=[
! ('CHAR_SIZE', 8),
! ],
! library_dirs=[],
! # libraries to link against
! libraries=LIBS,
! ),
! Extension( 'uRXP',
! uRXPLIBSOURCES,
! include_dirs=[RXPDIR],
! define_macros=[
! ('CHAR_SIZE', 16),
! ],
library_dirs=[],
# libraries to link against
libraries=LIBS,
),
! ],
! license = open(os.path.join('rxp','COPYING')).read(),
! classifiers = [
! 'Development Status :: 5 - Production/Stable',
! 'Intended Audience :: Developers',
! 'License :: OSI Approved :: GNU General Public License (GPL)',
! 'Programming Language :: Python',
! 'Programming Language :: C',
! 'Operating System :: Unix',
! 'Operating System :: POSIX',
! 'Operating System :: Microsoft :: Windows',
! 'Topic :: Software Development :: Libraries :: Python Modules',
! 'Topic :: Text Processing :: Markup :: XML',
! ]
)
+ #if os.path.exists('_uRXP'):
+ # shutil.rmtree('_uRXP')
! if sys.platform=='win32' and ('install' in sys.argv
! or 'install_ext' in sys.argv):
def MovePYDs(*F):
for x in sys.argv:
if x[:18]=='--install-platlib=': return
***************
*** 71,73 ****
--- 122,125 ----
os.rename(srcf,dstf)
print 'Renaming %s to %s' % (srcf, dstf)
MovePYDs('pyRXP.pyd',)
+ MovePYDs('uRXP.pyd',)
Index: examples/benchmarks.py
===================================================================
RCS file: /cvsroot/reportlab/rl_addons/pyRXP/examples/benchmarks.py,v
retrieving revision 1.1
diff -c -c -r1.1 benchmarks.py
*** examples/benchmarks.py 29 Apr 2002 13:54:15 -0000 1.1
--- examples/benchmarks.py 15 Feb 2003 00:41:05 -0000
***************
*** 15,20 ****
--- 15,22 ----
import string
from types import TupleType
import cStringIO
+ import os
+ import os.path
def tupleTreeStats(node):
# counts tags and attributes recursively
***************
*** 47,52 ****
--- 49,70 ----
def parseWithPyRXP(parser, rawdata):
return parser.parse(rawdata)
+ ### uRXP - Unicode version of pyRXP
+
+ def getuRXPParser():
+ import uRXP
+ p = uRXP.Parser()
+ return p
+
+ def getNonValidatinguRXPParser():
+ import uRXP
+ p = uRXP.Parser(Validate=0)
+ return p
+
+ def parseWithuRXP(parser, rawdata):
+ return parser.parse(rawdata)
+
+
### rparsexml - Aaron's very fast pure python parser
def loadRparseXML():
***************
*** 130,135 ****
--- 148,156 ----
# function to parse; function to do stats
('pyRXP', getPyRXPParser, parseWithPyRXP, tupleTreeStats),
('pyRXP_nonvalidating', getNonValidatingPyRXPParser, parseWithPyRXP, tupleTreeStats),
+ ('uRXP', getuRXPParser, parseWithuRXP, tupleTreeStats),
+ ('uRXP_nonvalidating', getNonValidatinguRXPParser, parseWithuRXP,
+ tupleTreeStats),
('rparsexml', loadRparseXML, parseWithRParseXML, tupleTreeStats),
('expat', getExpatParser, parseWithExpat, tupleTreeStats),
('minidom', loadMiniDOM, parseWithMiniDOM, statsWithMiniDOM),
Index: rxp/xmlparser.h
===================================================================
RCS file: /cvsroot/reportlab/rl_addons/pyRXP/rxp/xmlparser.h,v
retrieving revision 1.2
diff -c -c -r1.2 xmlparser.h
*** rxp/xmlparser.h 22 Mar 2002 11:00:37 -0000 1.2
--- rxp/xmlparser.h 15 Feb 2003 00:41:07 -0000
***************
*** 103,108 ****
--- 103,109 ----
ErrorOnBadCharacterEntities,
ErrorOnUndefinedEntities,
ReturnComments,
+ ReturnProcessingInstructions,
CaseInsensitive,
ErrorOnUndefinedElements,
ErrorOnUndefinedAttributes,
--Apple-Mail-4-397099313
Content-Disposition: attachment;
filename=test_uRXP.py
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
x-unix-mode=0664;
name="test_uRXP.py"
#!/usr/bin/env python
'''
$Id$
'''
__rcs_id__ = '$Id$'
__version__ = '$Revision: 0.0 $'[11:-2]
__author__ = 'Stuart Bishop <stuart@stuartbishop.net>'
import unittest
import uRXP
class test_uRXP(unittest.TestCase):
def setUp(self):
    """Create a fresh uRXP parser for each test and expose its bound
    parse method as self.parse."""
    self.parser = uRXP.Parser()
    self.parse = self.parser.parse
def test_DefaultExceptionHandling(self):
xml = '''<foo>'''
try:
self.parse(xml,SimpleErrorFormat = 0)
except uRXP.error,x:
if str(x).startswith('Error: Document ends too soon'):
pass
else:
self.fail("Bad SimpleErrorFormat message: %r (%r)" % (x,str(x)))
try:
self.parse(xml,SimpleErrorFormat = 1)
except uRXP.error,x:
if str(x).startswith('[unknown]:1:6: Document ends too soon'):
pass
else:
self.fail("Bad SimpleErrorFormat message: %r (%r)" % (x,str(x)))
def test_ExpandDecimalEntities(self):
xml = '''<theory>e = m c²</theory>'''
r = self.parse(xml)
self.assertEqual(r[2][0],u'e = m c\xb2')
def test_ExpandHexEntities(self):
xml = '''<theory>e = m c²</theory>'''
r = self.parse(xml)
self.assertEqual(r[2][0],u'e = m c\xb2')
def test_ExpandAttributeEntities(self):
xml = '''<currency name="pound" symbol="₤"/>'''
r = self.parse(xml)
self.assertEqual(r[1]['symbol'],u'\u20a4')
def test_Comments(self):
xml = '''<a><!-- Comment --></a>'''
self.parser.ReturnComments = 1
r = self.parse(xml)
self.assertEqual(r[2][0],u'<!-- Comment -->')
def test_EntitiesInComments(self):
# Entities should not get expanded
self.parser.ReturnComments = 1
xml = '''<a><!-- ² --></a>'''
r = self.parse(xml)
self.assertEqual(r[2][0],u'<!-- ² -->')
def test_ProcessingInstructions(self):
xml = '''<a><?some junk?></a>'''
self.parser.ReturnProcessingInstructions = 1
r = self.parse(xml)
self.assertEqual(r[2][0],u'<?some junk?>')
def test_EntitiesInProcessingInstructions(self):
xml = '''<a><?some junk '² ?></a>'''
self.parser.ReturnProcessingInstructions = 1
r = self.parse(xml)
self.assertEqual(r[2][0],u"<?some junk '² ?>")
def test_Prolog(self):
xml = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE fixthistest>
<doc/>
'''
self.assertEqual(self.parse(xml)[0],u'doc')
self.parser.ReturnProlog = 1
r = self.parse(xml)
self.assertEqual(r[0],(u'doc', None, None, None))
self.parser.ReturnProcessingInstructions = 1
r = self.parse(xml)
self.assertEqual(r[0],u'<?xml version="1.0" encoding="utf-8"?><doc/>')
self.assertEqual(r[1],(u'doc', None, None, None))
self.parser.ReturnProcessingInstructions = 1
self.parser.ReturnSomethingOrOtherToFix = 1
self.assertEqual(r[0],u'<?xml version="1.0" encoding="utf-8"?><doc/>')
self.assertEqual(r[1],u'<!DOCTYPE fixthistest>')
self.assertEqual(r[2],(u'doc', None, None, None))
# Run every test in this file when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
--Apple-Mail-4-397099313
Content-Disposition: attachment;
filename=test_xmltestsuite.py
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
x-unix-mode=0644;
name="test_xmltestsuite.py"
#!/usr/bin/env python
'''
$Id: test_xmltestsuite.py,v 1.1 2003/02/08 16:35:39 zen Exp $
Test parsing and validation against James Clark's test cases,
as downloaded from http://www.jclark.com/xml/
'''
__rcs_id__ = '$Id: test_xmltestsuite.py,v 1.1 2003/02/08 16:35:39 zen Exp $'
__version__ = '$Revision: 1.1 $'[11:-2]
__author__ = 'Stuart Bishop <stuart@stuartbishop.net>'
debug = 0
import unittest
import zipfile
import sys
import os
import os.path
import pyRXP
# uRXP is optional: fall back to None when the extension is not built.
# Only ImportError is swallowed, so unrelated failures are not hidden.
try:
    import uRXP
except ImportError:
    uRXP = None
import codecs
if debug: import time
# 2.2 compatibility - sort of: when this is run as a script there may be
# no __file__, so fabricate one inside the current directory so that
# os.path.dirname(__file__) still resolves to somewhere usable.
try:
    __file__
except NameError:
    __file__ = os.path.join(os.getcwd(),'oops')
class test_uRXP_XMLTestSuite(unittest.TestCase):
mod = uRXP
def setUp(self):
    """Unpack xmltest.zip next to this file and sort its test documents.

    Every member that is not already on disk is extracted beneath
    self.testdir.  Members are then classified by substring of their
    archive name:
      self.invalid -- paths containing 'invalid'
      self.notwf   -- paths containing 'not-wf'
      self.valid   -- (input path, expected-output path) pairs for paths
                      containing 'valid'; the expected output lives in
                      the sibling 'out' directory under the same name.
    Members whose name contains 'out' are never classified.
    """
    self.valid = []
    self.invalid = []
    self.notwf = []
    self.testdir = os.path.dirname(__file__)
    zipf = zipfile.ZipFile(os.path.join(self.testdir,'xmltest.zip'))
    try:
        for zipname in zipf.namelist():
            osname = os.path.join(*zipname.split('/')) # For non-unixes
            osname = os.path.join(self.testdir,osname)
            if zipname.endswith('/'):
                # Directory entry: create it, nothing to extract or classify.
                if not os.path.isdir(osname):
                    os.makedirs(osname)
                continue
            dirname = os.path.dirname(osname)
            if dirname and not os.path.isdir(dirname):
                os.makedirs(dirname)
            if not os.path.isfile(osname):
                f = open(osname,'wb')
                try:
                    f.write(zipf.read(zipname))
                finally:
                    f.close()
            if zipname.find('out') != -1:
                continue
            # Only .xml members are queued as documents to parse; other
            # members are extracted above so the documents can reference
            # them.  NOTE(review): assumes the suite's companion files
            # (external entities, DTDs) never end in .xml -- confirm
            # against the archive.
            if not zipname.endswith('.xml'):
                continue
            if zipname.find('invalid') != -1:
                self.invalid.append(osname)
            elif zipname.find('not-wf') != -1:
                self.notwf.append(osname)
            elif zipname.find('valid') != -1:
                outname = os.path.join(dirname,'out',os.path.basename(osname))
                self.valid.append( (osname,outname) )
    finally:
        zipf.close()
def parse(self,filename,**kw):
    """Parse the file `filename` with a new self.mod.Parser and return
    the parser's result.

    Keyword arguments are passed to the Parser constructor, with
    ReturnComments and ExpandEmpty forced to 1.  The working directory
    is switched to the file's directory for the duration of the parse
    and always restored afterwards.
    """
    if debug: print >> sys.stderr,'About to parse %s' % filename
    kw = kw.copy()
    kw['ReturnComments'] = 1
    kw['ExpandEmpty'] = 1
    #kw['ReturnProcessingInstructions'] = 1
    parser = self.mod.Parser(**kw)
    # Change directory in case we are loading entities from cwd
    retdir = os.getcwd()
    d,n = os.path.split(filename)
    if d:
        # A bare filename has no directory part; os.chdir('') would raise.
        os.chdir(d)
    try:
        f = open(n)
        try:
            xml = f.read()
        finally:
            f.close()
        return parser.parse(xml)
    finally:
        os.chdir(retdir)
        if debug: print >> sys.stderr,'Done parsing %s' % filename
        if debug: print >> sys.stderr,'='*60
        if debug: time.sleep(1)
def getcanonical(self,filename):
    ''' Parse in the named file, and return it as canonical XML '''
    tree = self.parse(filename)
    return self._getcan(tree)
def _getcan(self,node):
if type(node) in (type(''),type(u'')):
if node.startswith(u'<?') or node.startswith(u'<!'):
return node
else:
return self._quote(node)
tag,attrs,kids,junk = node
if attrs is None:
attrs = ''
else:
keys = attrs.keys()
keys.sort() # Attributes in lexical order
attrs = ' '.join(
['%s="%s"' % (k,self._quote(attrs[k])) for k in keys]
)
if attrs:
attrs = ' ' + attrs
text = ''.join([self._getcan(kid) for kid in kids])
return '<%s%s>%s</%s>' % (tag,attrs,text,tag)
def _quote(self,txt):
txt = txt.replace('&','&')
txt = txt.replace('<','<')
txt = txt.replace('>','>')
txt = txt.replace('"','"')
txt = txt.replace('\x09','	')
txt = txt.replace('\x0a',' ')
txt = txt.replace('\x0d',' ')
return txt
def test_valid(self):
for inname,outname in self.valid:
inxml = self.getcanonical(inname)
f = codecs.open(outname,mode='r',encoding='utf8')
outxml = f.read()
f.close()
self.assertEqual(inxml,outxml)
def test_invalid_parse(self):
for inname in self.invalid:
try:
self.parse(inname,Validate=0)
except self.mod.error,x:
self.fail('Failed to parse %r in non-validating mode' % inname)
def test_invalid_validate(self):
for inname in self.invalid:
try:
self.parse(inname,Validate=1)
self.fail('Failed to detect validity error in %r' % inname)
except self.mod.error:
pass
def test_notwf(self):
for inname in self.notwf:
try:
self.parse(inname,Validate=0)
self.fail('Failed to detect well-formedness in %r' % inname)
except self.mod.error:
pass
class test_pyRXP_XMLTestSuite(test_uRXP_XMLTestSuite):
    # Runs the inherited tests against the pyRXP module instead of uRXP.
    mod = pyRXP
# Run every test in this file when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
--Apple-Mail-4-397099313
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
format=flowed
--
Stuart Bishop <zen@shangri-la.dropbear.id.au>
http://shangri-la.dropbear.id.au/
--Apple-Mail-4-397099313--