[reportlab-users] re: Unicode pyRXP and malloc problems in pyRXP

Sat, 15 Feb 2003 13:31:32 +1100

--Apple-Mail-4-397099313
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed

On Saturday, February 15, 2003, at 12:17  AM, Robin Becker wrote:

> Stuart Bishop, if you have working code for Mac uRXP I will try and 
> test
> on the PC to see if your malloc issues are OS related.

It would be worth testing the bog standard pyRXP 0.9 with the example
file I posted first - it is affected as well.

> Context diff patch would be fine.

diff attached, as well as the brand new files since cvs diff doesn't
want to mention them...

test_xmltestsuite.py depends on xmltest.zip, which can be downloaded
from ftp://ftp.jclark.com/pub/xml/xmltest.zip

> The pyRXP module assumes everything is 8 bit, so it's likely that
> anything malloced by it will have wrong sizes when asked to do 16 bit
> things.

I think I've got these points all sorted.

--Apple-Mail-4-397099313
Content-Disposition: attachment;
	filename=uRXP.diff
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0664;
	name="uRXP.diff"

? .gdb_history
? _uRXP
? build
? oldRXP.c
? pyRXP.c.new
? setup.pyc
? examples/.gdb_history
? examples/expattree.pyc
? examples/hamlet.py
? rxp/rxp
? test/.gdb_history
? test/test_uRXP.py
? test/test_xmltestsuite.py
? test/test_xmltestsuite.py.org
? test/xmltest
? test/xmltest.zip
Index: pyRXP.c
===================================================================
RCS file: /cvsroot/reportlab/rl_addons/pyRXP/pyRXP.c,v
retrieving revision 1.10
diff -c -c -r1.10 pyRXP.c
*** pyRXP.c	25 Oct 2002 15:27:40 -0000	1.10
--- pyRXP.c	15 Feb 2003 00:41:05 -0000
***************
*** 11,17 ****
  #include <stdarg.h>

  #ifndef CHAR_SIZE
! #define CHAR_SIZE 8
  #endif

  #include "system.h"
--- 11,26 ----
  #include <stdarg.h>

  #ifndef CHAR_SIZE
! #error CHAR_SIZE not specified
! #endif
! 
! #if CHAR_SIZE == 16
! #define MODULE "uRXP"
! #elif CHAR_SIZE == 8
! #define PYSTRING(s) PyString_FromString(s)
! #define MODULE "pyRXP"
! #else
! #error Invalid CHAR_SIZE specified
  #endif

  #include "system.h"
***************
*** 24,32 ****
  #include "stdio16.h"
  #include "version.h"
  #include "namespaces.h"
! #define VERSION "0.9"
! #define MODULE "pyRXP"
  #define MAX_DEPTH 256
  static PyObject *moduleError;
  static PyObject *moduleVersion;
  static PyObject *RXPVersion;
--- 33,51 ----
  #include "stdio16.h"
  #include "version.h"
  #include "namespaces.h"
! #define VERSION "0.9.1"
  #define MAX_DEPTH 256
+ 
+ #if CHAR_SIZE == 16
+ PyObject* PYSTRING(const Char* s) {
+     PyObject* rV;
+     int len = 0;
+     len = (int) Strlen( s );
+     rV = PyUnicode_Decode( (const char*) s, len*2, "utf16", NULL);
+     return rV;
+ }
+ #endif
+ 
  static PyObject *moduleError;
  static PyObject *moduleVersion;
  static PyObject *RXPVersion;
***************
*** 115,125 ****
  			looks the same as the entity reference.\n\
  		ReturnComments = 0\n\
  			If this is set, comments are returned, otherwise they are ignored.\n\
  		CaseInsensitive = 0\n\
  		ErrorOnUndefinedElements = 0\n\
  		ErrorOnUndefinedAttributes = 0\n\
! 			If these are set and there is a DTD, references to undeclared elements\n\
! 			and attributes are an error.\n\
  		WarnOnRedefinitions = 0\n\
  			If this is on, a warning is given for redeclared elements, attributes,\n\
  			entities and notations.\n"
--- 134,147 ----
  			looks the same as the entity reference.\n\
  		ReturnComments = 0\n\
  			If this is set, comments are returned, otherwise they are ignored.\n\
+ 		ReturnProcessingInstructions = 0\n\
+ 			If this is set, processing instructions are returned, otherwise\n\
+ 			they are ignored.\n\
  		CaseInsensitive = 0\n\
  		ErrorOnUndefinedElements = 0\n\
  		ErrorOnUndefinedAttributes = 0\n\
! 			If these are set and there is a DTD, references to undeclared\n\
! 			elements and attributes are an error.\n\
  		WarnOnRedefinitions = 0\n\
  			If this is on, a warning is given for redeclared elements, attributes,\n\
  			entities and notations.\n"
***************
*** 189,194 ****
--- 211,217 ----
  	{"ErrorOnBadCharacterEntities",1},
  	{"ErrorOnUndefinedEntities",1},
  	{"ReturnComments",0},
+ 	{"ReturnProcessingInstructions",0},
  	{"CaseInsensitive",0},
  	{"ErrorOnUndefinedElements",0},
  	{"ErrorOnUndefinedAttributes",0},
***************
*** 250,260 ****
  	int		useNone = pd->none_on_empty && !a;

  	if(!useNone){
! 		PyObject	*attrs=PyDict_New(), *t;
  		for(; a; a=a->next){
! 			PyDict_SetItemString(attrs, (char*)a->definition->name,
  							t=PyString_FromString(a->value));
! 			Py_DECREF(t);
  			}
  		return attrs;
  		}
--- 273,289 ----
  	int		useNone = pd->none_on_empty && !a;

  	if(!useNone){
! 		PyObject	*attrs=PyDict_New(), *t1,*t2;
  		for(; a; a=a->next){
! 			/*PyDict_SetItemString(attrs, (char*)a->definition->name,
  							t=PyString_FromString(a->value));
! 			Py_DECREF(t);*/
! 			PyDict_SetItem(attrs,
! 				t1=PYSTRING( (Char*)a->definition->name ),
! 				t2=PYSTRING( (Char*)a->value )
! 				);
! 			Py_DECREF(t1);
! 			Py_DECREF(t2);
  			}
  		return attrs;
  		}
***************
*** 264,273 ****
  		}
  }

! static	PyObject* makeNode(ParserDetails* pd, char *name, PyObject* attr, int empty)
  {
  	PyObject	*t = PDNode_New(4);
! 	PDSetItem(t, 0, PyString_FromString(name));
  	PDSetItem(t, 1, attr);
  	if(empty && pd->none_on_empty){
  		attr = Py_None;
--- 293,302 ----
  		}
  }

! static	PyObject* makeNode(ParserDetails* pd, const Char *name, PyObject* attr, int empty)
  {
  	PyObject	*t = PDNode_New(4);
! 	PDSetItem(t, 0, PYSTRING(name));
  	PDSetItem(t, 1, attr);
  	if(empty && pd->none_on_empty){
  		attr = Py_None;
***************
*** 285,290 ****
--- 314,323 ----
  	return t;
  }

+ #if CHAR_SIZE == 16
+ Char* com_head;
+ Char* com_tail;
+ #endif

  static	int handle_bit(Parser p, XBit bit, PyObject *stack[],int *depth)
  {
***************
*** 306,313 ****
  				}

  			empty = bit->type == XBIT_empty;
! 			t = makeNode( pd, (char*)bit->element_definition->name,
! 					get_attrs(pd, bit->element_definition, bit->attributes), empty);
  			if(empty){
  				PyList_Append(PDGetItem(stack[*depth],2),t);
  				Py_DECREF(t);
--- 339,346 ----
  				}

  			empty = bit->type == XBIT_empty;
! 			t = makeNode( pd, bit->element_definition->name,
! 				get_attrs(pd, bit->element_definition, bit->attributes), empty);
  			if(empty){
  				PyList_Append(PDGetItem(stack[*depth],2),t);
  				Py_DECREF(t);
***************
*** 329,346 ****
  			Py_DECREF(t);
  			break;
  		case XBIT_pi:
! #if			0
! 			bit->pi_name;
! 			bit->pi_chars;
! #endif
  			break;
  		case XBIT_pcdata:
! 			t = PyString_FromString(bit->pcdata_chars);
  			PyList_Append(PDGetItem(stack[*depth],2),t);
  			Py_DECREF(t);
  			break;
  		case XBIT_cdsect:
! 			t = PyString_FromString(bit->cdsect_chars);
  			PyList_Append(PDGetItem(stack[*depth],2),t);
  			Py_DECREF(t);
  			break;
--- 362,395 ----
  			Py_DECREF(t);
  			break;
  		case XBIT_pi:
! 			if(ParserGetFlag(p,ReturnProcessingInstructions)){
! 				Char* c = (Char*)PyMem_Malloc(
! 					(Strlen(bit->pi_name) + Strlen(bit->pi_chars) + 6)*2
! 					);
! 				Char* z = strdup_char8_to_Char("<?");
! 				Strcpy(c,z);
! 				free(z);
! 				Strcat(c,bit->pi_name);
! 				z = strdup_char8_to_Char(" ");
! 				Strcat(c,z);
! 				free(z);
! 				Strcat(c,bit->pi_chars);
! 				z = strdup_char8_to_Char("?>");
! 				Strcat(c,z);
! 				free(z);
! 				t = PYSTRING(c);
! 				PyList_Append(PDGetItem(stack[*depth],2),t);
! 				Py_DECREF(t);
! 				PyMem_Free(c);
! 				}
  			break;
  		case XBIT_pcdata:
! 			t = PYSTRING(bit->pcdata_chars);
  			PyList_Append(PDGetItem(stack[*depth],2),t);
  			Py_DECREF(t);
  			break;
  		case XBIT_cdsect:
! 			t = PYSTRING(bit->cdsect_chars);
  			PyList_Append(PDGetItem(stack[*depth],2),t);
  			Py_DECREF(t);
  			break;
***************
*** 348,362 ****
  			break;
  		case XBIT_comment:
  			if(ParserGetFlag(p,ReturnComments)){
  				char* c = (char*)PyMem_Malloc(strlen(bit->comment_chars)+8);
  				strcpy(c,"<!--");
  				strcat(c,bit->comment_chars);
  				strcat(c,"-->");
! 				t = PyString_FromString(c);
  				PyList_Append(PDGetItem(stack[*depth],2),t);
  				Py_DECREF(t);
  				PyMem_Free(c);
  				}
  			break;
  		default:
  			Fprintf(Stderr, "\nUnknown event type %s\n", XBitTypeName[bit->type]);
--- 397,420 ----
  			break;
  		case XBIT_comment:
  			if(ParserGetFlag(p,ReturnComments)){
+ #if CHAR_SIZE == 8
  				char* c = (char*)PyMem_Malloc(strlen(bit->comment_chars)+8);
  				strcpy(c,"<!--");
  				strcat(c,bit->comment_chars);
  				strcat(c,"-->");
! #elif CHAR_SIZE == 16
! 				Char* c = (Char*)PyMem_Malloc(Strlen(bit->comment_chars)*2+16);
! 				Strcpy(c,com_head);
! 				Strcat(c,bit->comment_chars);
! 				Strcat(c,com_tail);
! #endif
! 				t = PYSTRING(c);
  				PyList_Append(PDGetItem(stack[*depth],2),t);
  				Py_DECREF(t);
  				PyMem_Free(c);
+ 
  				}
+ 
  			break;
  		default:
  			Fprintf(Stderr, "\nUnknown event type %s\n", XBitTypeName[bit->type]);
***************
*** 376,382 ****
  	if(e->type==ET_external){
  		PyObject		*arglist;
  		PyObject		*result;
! 		arglist = Py_BuildValue("(s)",e->systemid);
  		result = PyEval_CallObject(eoCB, arglist);
  		if(result){
  			if(PyString_Check(result)){
--- 434,440 ----
  	if(e->type==ET_external){
  		PyObject		*arglist;
  		PyObject		*result;
! 		arglist = Py_BuildValue("(s)",e->systemid);/* NB. 8 bit */
  		result = PyEval_CallObject(eoCB, arglist);
  		if(result){
  			if(PyString_Check(result)){
***************
*** 399,413 ****
--- 457,484 ----
  }

  void PyErr_FromStderr(Parser p, char *msg){
+ 	/* Yech. This appears to be pulling the error messages
+ 		from the internals of RXP's Stderr. */
  	struct _FILE16 {
  		void *handle;
  		int handle2, handle3;
  		};
+ #if CHAR_SIZE == 8
  	char *buf=((struct _FILE16*)Stderr)->handle;
  	if(p->errbuf) Fprintf(Stderr,"%s\n", p->errbuf);
  	Fprintf(Stderr,"%s\n", msg);
  	buf[((struct _FILE16*)Stderr)->handle2] = 0;
  	PyErr_SetString(moduleError,buf);
+ #else
+ 	Char *buf=((struct _FILE16*)Stderr)->handle;
+ 	if(p->errbuf) Fprintf(Stderr,"%s\n", p->errbuf);
+ 	Fprintf(Stderr,"%s\n", msg);
+ 	buf[((struct _FILE16*)Stderr)->handle2 / 2] = 0;
+ 	buf[(((struct _FILE16*)Stderr)->handle2 / 2) + 1] = 0;
+ 	PyObject* t = PYSTRING(buf);
+ 	PyErr_SetObject(moduleError,t);
+ 	Py_DECREF(t);
+ #endif
  }

  /*return non zero for error*/
***************
*** 425,431 ****
  		}

  	depth = 0;
! 	stack[0] = makeNode( pd, "", Py_None, 0);	/*stealing a reference to Py_None*/
  	Py_INCREF(Py_None);					/*so we must correct for it*/
  	while(1){
  		XBitType bt;
--- 496,502 ----
  		}

  	depth = 0;
! 	stack[0] = makeNode( pd,(const Char*)"", Py_None, 0);	/*stealing a reference to Py_None*/
  	Py_INCREF(Py_None);					/*so we must correct for it*/
  	while(1){
  		XBitType bt;
***************
*** 487,492 ****
--- 558,564 ----
  	str = MakeFILE16FromString(buf,sizeof(buf)-1,"w");
  	_ParserPerror(str, pd->p, bit);
  	Fclose(str);
+ 	/* TODO: This probably needs to be unicode as well */
  	arglist = Py_BuildValue("(s)",buf);
  	result = PyEval_CallObject(pd->warnCB, arglist);
  	Py_DECREF(arglist);
***************
*** 675,681 ****
  	int	i;
  	if(!strcmp(name,"warnCB")) return _get_OB(name,self->warnCB);
  	else if(!strcmp(name,"eoCB")) return _get_OB(name,self->eoCB);
! 	else if(!strcmp(name,"fourth")) return _get_OB(name,self->eoCB);
  	else if(!strcmp(name,"srcName")){
  		Py_INCREF(self->srcName);
  		return self->srcName;
--- 747,753 ----
  	int	i;
  	if(!strcmp(name,"warnCB")) return _get_OB(name,self->warnCB);
  	else if(!strcmp(name,"eoCB")) return _get_OB(name,self->eoCB);
! 	else if(!strcmp(name,"fourth")) return _get_OB(name,self->fourth);
  	else if(!strcmp(name,"srcName")){
  		Py_INCREF(self->srcName);
  		return self->srcName;
***************
*** 755,765 ****
  }

  static struct PyMethodDef moduleMethods[] = {
! 	{"Parser",	(PyCFunction)pyRXPParser,	METH_VARARGS|METH_KEYWORDS, "Parser(*kw) create a pyRXP parser instance"},
  	{NULL,	NULL}	/*sentinel*/
  };

  DL_EXPORT(void) initpyRXP(void)
  {
  	PyObject *m, *d, *v, *t;
  	int	i;
--- 827,844 ----
  }

  static struct PyMethodDef moduleMethods[] = {
! 	{"Parser",	(PyCFunction)pyRXPParser,	
! 		METH_VARARGS|METH_KEYWORDS, 
! 		"Parser(*kw) create a pyRXP parser instance"
! 		},
  	{NULL,	NULL}	/*sentinel*/
  };

+ #if CHAR_SIZE == 16
+ DL_EXPORT(void) inituRXP(void)
+ #elif CHAR_SIZE == 8
  DL_EXPORT(void) initpyRXP(void)
+ #endif
  {
  	PyObject *m, *d, *v, *t;
  	int	i;
***************
*** 777,786 ****
  	RXPVersion = PyString_FromString(rxp_version_string);
  	PyDict_SetItemString(d, "RXPVersion", RXPVersion );
  	moduleError = PyErr_NewException(MODULE ".Error",NULL,NULL);
! 	PyDict_SetItemString(d,"error",moduleError);
  	parser_flags = PyDict_New();
  	for(i=0;flag_vals[i].k;i++){
! 		PyDict_SetItemString(parser_flags, flag_vals[i].k, t=PyInt_FromLong(flag_vals[i].v));
  		Py_DECREF(t);
  		}
  	PyDict_SetItemString(d,"parser_flags",parser_flags);
--- 856,866 ----
  	RXPVersion = PyString_FromString(rxp_version_string);
  	PyDict_SetItemString(d, "RXPVersion", RXPVersion );
  	moduleError = PyErr_NewException(MODULE ".Error",NULL,NULL);
! 	PyDict_SetItemString(d,"error",moduleError); 
  	parser_flags = PyDict_New();
  	for(i=0;flag_vals[i].k;i++){
! 		PyDict_SetItemString(parser_flags, flag_vals[i].k, 
!             t=PyInt_FromLong(flag_vals[i].v));
  		Py_DECREF(t);
  		}
  	PyDict_SetItemString(d,"parser_flags",parser_flags);
***************
*** 789,792 ****
--- 869,880 ----
  	v = PyString_FromString(moduleDoc);
  	PyDict_SetItemString(d, "__doc__", v);
  	Py_DECREF(v);
+ 
+ #if CHAR_SIZE == 16
+ 	com_head = (Char*)PyMem_Malloc(10);
+ 	com_tail = (Char*)PyMem_Malloc(8);
+ 	char8_to_Char("<!--",com_head);
+ 	char8_to_Char("-->",com_tail);
+ #endif
+ 
  }
Index: setup.py
===================================================================
RCS file: /cvsroot/reportlab/rl_addons/pyRXP/setup.py,v
retrieving revision 1.6
diff -c -c -r1.6 setup.py
*** setup.py	3 May 2002 10:20:22 -0000	1.6
--- setup.py	15 Feb 2003 00:41:05 -0000
***************
*** 4,11 ****
--- 4,18 ----
  #history http://cvs.sourceforge.net/cgi-bin/cvsweb.cgi/rl_addons/pyRXP/setup.py?cvsroot=reportlab
  #$Header: /cvsroot/reportlab/rl_addons/pyRXP/setup.py,v 1.6 2002/05/03 10:20:22 rgbecker Exp $
  if __name__=='__main__': #NO RUNTESTS
+ 
  	import os, sys
+ 	import shutil
  	from distutils.core import setup, Extension
+     
+     # patch distutils if it can't cope with the "classifiers" keyword
+ 	if sys.version < '2.2.3':
+ 		from distutils.dist import DistributionMetadata
+ 		DistributionMetadata.classifiers = None

  	def raiseConfigError(msg):
  		import exceptions 
***************
*** 13,23 ****
  			pass 
  		raise ConfigError(msg)

  	RXPDIR='rxp'
  	RXPLIBSOURCES=[]
! 	for f in ('xmlparser.c', 'url.c', 'charset.c', 'string16.c', 'ctype16.c', 'dtd.c',
! 			'input.c', 'stdio16.c', 'system.c', 'hash.c', 'version.c', 'namespaces.c', 'http.c'):
! 		RXPLIBSOURCES.append(os.path.join(RXPDIR,f))

  	if sys.platform=="win32":
  		LIBS=['wsock32']
--- 20,46 ----
  			pass 
  		raise ConfigError(msg)

+ 	# We copy the rxp source - we need to build it a second time for uRXP
+ 	# with different compile time flags
+ 	if os.path.exists('_uRXP'):
+ 		shutil.rmtree('_uRXP')
+ 	os.makedirs('_uRXP')
+ 
  	RXPDIR='rxp'
+ 	uRXPDIR='_uRXP'
  	RXPLIBSOURCES=[]
! 	uRXPLIBSOURCES=[]
! 	for f in ('xmlparser.c', 'url.c', 'charset.c', 'string16.c', 'ctype16.c', 
!                 'dtd.c', 'input.c', 'stdio16.c', 'system.c', 'hash.c', 
!                 'version.c', 'namespaces.c', 'http.c'):
! 		RXP_file = os.path.join(RXPDIR,f)
! 		uRXP_file = os.path.join(uRXPDIR,f)
! 		RXPLIBSOURCES.append(RXP_file)
! 		shutil.copy2(RXP_file,uRXP_file)
! 		uRXPLIBSOURCES.append(uRXP_file)
! 	uRXP_c = os.path.join(uRXPDIR,'uRXP.c')
! 	shutil.copy2('pyRXP.c',uRXP_c)
! 	uRXPLIBSOURCES.append(uRXP_c)

  	if sys.platform=="win32":
  		LIBS=['wsock32']
***************
*** 30,61 ****
  	else:
  		msg = "Don't know about system %s" % sys.platform
  		if int(os.environ.get('LIBERROR',1)): 
! 			raiseConfigError(msg+'\nset environment LIBERROR=0 to try no extra libs')
  		else:
  			print msg
  			LIBS=[]

- 
  	setup(	name = "pyRXP",
! 			version = "0.5",
! 			description = "Python RXP interface",
  			author = "Robin Becker",
  			author_email = "robin@reportlab.com",
  			url = "http://www.reportlab.com",
  			packages = [],
  			ext_modules = 	[Extension(	'pyRXP',
! 										['pyRXP.c']+RXPLIBSOURCES,
  										include_dirs=[RXPDIR],
! 										define_macros=[('CHAR_SIZE', 8)],
  										library_dirs=[],
- 
  										# libraries to link against
  										libraries=LIBS,
  										),
! 							]
  			)

! 	if sys.platform=='win32' and ('install' in sys.argv or 'install_ext' in sys.argv):
  		def MovePYDs(*F):
  			for x in sys.argv:
  				if x[:18]=='--install-platlib=': return
--- 53,112 ----
  	else:
  		msg = "Don't know about system %s" % sys.platform
  		if int(os.environ.get('LIBERROR',1)): 
! 			raiseConfigError(
!                 msg+'\nset environment LIBERROR=0 to try no extra libs'
!                 )
  		else:
  			print msg
  			LIBS=[]

  	setup(	name = "pyRXP",
! 			version = "0.9.1",
! 			description = "Python RXP interface - fast validating XML parser",
  			author = "Robin Becker",
  			author_email = "robin@reportlab.com",
  			url = "http://www.reportlab.com",
  			packages = [],
  			ext_modules = 	[Extension(	'pyRXP',
! 										['oldRXP.c']+RXPLIBSOURCES,
  										include_dirs=[RXPDIR],
! 										define_macros=[
!                                             ('CHAR_SIZE', 8),
!                                             ],
! 										library_dirs=[],
! 										# libraries to link against
! 										libraries=LIBS,
! 										),
! 							Extension(	'uRXP',
! 										uRXPLIBSOURCES,
! 										include_dirs=[RXPDIR],
! 										define_macros=[
!                                             ('CHAR_SIZE', 16),
!                                             ],
  										library_dirs=[],
  										# libraries to link against
  										libraries=LIBS,
  										),
! 							],
! 			license = open(os.path.join('rxp','COPYING')).read(),
!             classifiers = [
! 				'Development Status :: 5 - Production/Stable',
! 				'Intended Audience :: Developers',
! 				'License :: OSI Approved :: GNU General Public License (GPL)',
! 				'Programming Language :: Python',
! 				'Programming Language :: C',
! 				'Operating System :: Unix',
! 				'Operating System :: POSIX',
! 				'Operating System :: Microsoft :: Windows',
! 				'Topic :: Software Development :: Libraries :: Python Modules',
! 				'Topic :: Text Processing :: Markup :: XML',
!                 ]
  			)
+ 	#if os.path.exists('_uRXP'):
+ 	#	shutil.rmtree('_uRXP')

! 	if sys.platform=='win32' and ('install' in sys.argv 
!             or 'install_ext' in sys.argv):
  		def MovePYDs(*F):
  			for x in sys.argv:
  				if x[:18]=='--install-platlib=': return
***************
*** 71,73 ****
--- 122,125 ----
  				os.rename(srcf,dstf)
  				print 'Renaming %s to %s' % (srcf, dstf)
  		MovePYDs('pyRXP.pyd',)
+ 		MovePYDs('uRXP.pyd',)
Index: examples/benchmarks.py
===================================================================
RCS file: /cvsroot/reportlab/rl_addons/pyRXP/examples/benchmarks.py,v
retrieving revision 1.1
diff -c -c -r1.1 benchmarks.py
*** examples/benchmarks.py	29 Apr 2002 13:54:15 -0000	1.1
--- examples/benchmarks.py	15 Feb 2003 00:41:05 -0000
***************
*** 15,20 ****
--- 15,22 ----
  import string
  from types import TupleType
  import cStringIO
+ import os
+ import os.path

  def tupleTreeStats(node):
      # counts tags and attributes recursively
***************
*** 47,52 ****
--- 49,70 ----
  def parseWithPyRXP(parser, rawdata):
      return parser.parse(rawdata)

+ ###  uRXP - Unicode version of pyRXP
+ 
+ def getuRXPParser():
+     import uRXP
+     p = uRXP.Parser()
+     return p
+ 
+ def getNonValidatinguRXPParser():
+     import uRXP
+     p = uRXP.Parser(Validate=0)
+     return p
+ 
+ def parseWithuRXP(parser, rawdata):
+     return parser.parse(rawdata)
+ 
+ 
  ###  rparsexml - Aaron's very fast pure python parser

  def loadRparseXML():
***************
*** 130,135 ****
--- 148,156 ----
      # function to parse; function to do stats
      ('pyRXP', getPyRXPParser, parseWithPyRXP, tupleTreeStats),
      ('pyRXP_nonvalidating', getNonValidatingPyRXPParser, parseWithPyRXP, tupleTreeStats),
+     ('uRXP', getuRXPParser, parseWithuRXP, tupleTreeStats),
+     ('uRXP_nonvalidating', getNonValidatinguRXPParser, parseWithuRXP, 
+                 tupleTreeStats),
      ('rparsexml', loadRparseXML, parseWithRParseXML, tupleTreeStats),
      ('expat', getExpatParser, parseWithExpat, tupleTreeStats),
      ('minidom', loadMiniDOM, parseWithMiniDOM, statsWithMiniDOM),
Index: rxp/xmlparser.h
===================================================================
RCS file: /cvsroot/reportlab/rl_addons/pyRXP/rxp/xmlparser.h,v
retrieving revision 1.2
diff -c -c -r1.2 xmlparser.h
*** rxp/xmlparser.h	22 Mar 2002 11:00:37 -0000	1.2
--- rxp/xmlparser.h	15 Feb 2003 00:41:07 -0000
***************
*** 103,108 ****
--- 103,109 ----
      ErrorOnBadCharacterEntities,
      ErrorOnUndefinedEntities,
      ReturnComments,
+     ReturnProcessingInstructions,
      CaseInsensitive,
      ErrorOnUndefinedElements,
      ErrorOnUndefinedAttributes,

--Apple-Mail-4-397099313
Content-Disposition: attachment;
	filename=test_uRXP.py
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0664;
	name="test_uRXP.py"

#!/usr/bin/env python
'''
$Id$
'''

__rcs_id__  = '$Id$'
__version__ = '$Revision: 0.0 $'[11:-2]
__author__ = 'Stuart Bishop <stuart@stuartbishop.net>'

import unittest
import uRXP

class test_uRXP(unittest.TestCase):
    '''Unit tests for the uRXP parser module.

    Every test feeds a small XML string to a fresh uRXP.Parser and checks
    the returned tuple tree.  The tests index a node as node[0] (tag name),
    node[1] (attribute mapping) and node[2] (list of children).
    '''

    def setUp(self):
        # A new parser per test, so flags set by one test cannot leak
        # into the next one.
        self.parser = uRXP.Parser()
        self.parse = self.parser.parse

    def test_DefaultExceptionHandling(self):
        '''A truncated document must raise uRXP.error in both message formats.'''
        xml = '''<foo>'''
        try:
            self.parse(xml,SimpleErrorFormat = 0)
        except uRXP.error,x:
            if not str(x).startswith('Error: Document ends too soon'):
                self.fail("Bad SimpleErrorFormat message: %r (%r)" % (x,str(x)))
        else:
            # Without this branch the test passed silently whenever the
            # parser failed to report the error at all.
            self.fail("No uRXP.error raised for truncated document %r" % xml)

        try:
            self.parse(xml,SimpleErrorFormat = 1)
        except uRXP.error,x:
            if not str(x).startswith('[unknown]:1:6: Document ends too soon'):
                self.fail("Bad SimpleErrorFormat message: %r (%r)" % (x,str(x)))
        else:
            self.fail("No uRXP.error raised for truncated document %r" % xml)

    def test_ExpandDecimalEntities(self):
        '''Decimal character references are expanded in text content.'''
        xml = '''<theory>e = m c&#178;</theory>'''
        r = self.parse(xml)
        self.assertEqual(r[2][0],u'e = m c\xb2')

    def test_ExpandHexEntities(self):
        '''Hexadecimal character references are expanded in text content.'''
        xml = '''<theory>e = m c&#xB2;</theory>'''
        r = self.parse(xml)
        self.assertEqual(r[2][0],u'e = m c\xb2')

    def test_ExpandAttributeEntities(self):
        '''Character references above U+00FF are expanded in attribute values.'''
        xml = '''<currency name="pound" symbol="&#8356;"/>'''
        r = self.parse(xml)
        self.assertEqual(r[1]['symbol'],u'\u20a4')

    def test_Comments(self):
        '''With ReturnComments set, a comment comes back as a child string.'''
        xml = '''<a><!-- Comment --></a>'''
        self.parser.ReturnComments = 1
        r = self.parse(xml)
        self.assertEqual(r[2][0],u'<!-- Comment -->')

    def test_EntitiesInComments(self):
        # Entities should not get expanded
        self.parser.ReturnComments = 1
        xml = '''<a><!-- &#xB2; --></a>'''
        r = self.parse(xml)
        self.assertEqual(r[2][0],u'<!-- &#xB2; -->')

    def test_ProcessingInstructions(self):
        '''With ReturnProcessingInstructions set, a PI comes back verbatim.'''
        xml = '''<a><?some junk?></a>'''
        self.parser.ReturnProcessingInstructions = 1
        r = self.parse(xml)
        self.assertEqual(r[2][0],u'<?some junk?>')

    def test_EntitiesInProcessingInstructions(self):
        '''Character references inside a PI are returned unexpanded.'''
        xml = '''<a><?some junk '&#xB2; ?></a>'''
        self.parser.ReturnProcessingInstructions = 1
        r = self.parse(xml)
        self.assertEqual(r[2][0],u"<?some junk '&#xB2; ?>")

    def test_Prolog(self):
        # NOTE(review): this test looks unfinished - the DOCTYPE name and the
        # ReturnSomethingOrOtherToFix flag read as placeholders.  Confirm the
        # intended prolog behaviour before relying on these expectations.
        xml = '''<?xml version="1.0" encoding="utf-8"?>
                 <!DOCTYPE fixthistest>
                 <doc/>
              '''
        self.assertEqual(self.parse(xml)[0],u'doc')

        self.parser.ReturnProlog = 1
        r = self.parse(xml)
        self.assertEqual(r[0],(u'doc', None, None, None))

        self.parser.ReturnProcessingInstructions = 1
        r = self.parse(xml)
        self.assertEqual(r[0],u'<?xml version="1.0" encoding="utf-8"?><doc/>')
        self.assertEqual(r[1],(u'doc', None, None, None))

        self.parser.ReturnProcessingInstructions = 1
        self.parser.ReturnSomethingOrOtherToFix = 1
        # NOTE(review): r is not re-parsed after the flags above are set, so
        # the assertions below still inspect the previous result.
        self.assertEqual(r[0],u'<?xml version="1.0" encoding="utf-8"?><doc/>')
        self.assertEqual(r[1],u'<!DOCTYPE fixthistest>')
        self.assertEqual(r[2],(u'doc', None, None, None))

if __name__ == '__main__':
    unittest.main()

--Apple-Mail-4-397099313
Content-Disposition: attachment;
	filename=test_xmltestsuite.py
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0644;
	name="test_xmltestsuite.py"

#!/usr/bin/env python
'''
$Id: test_xmltestsuite.py,v 1.1 2003/02/08 16:35:39 zen Exp $
Test parsing and validation against James Clark's test cases,
as downloaded from http://www.jclark.com/xml/
'''

__rcs_id__  = '$Id: test_xmltestsuite.py,v 1.1 2003/02/08 16:35:39 zen Exp $'
__version__ = '$Revision: 1.1 $'[11:-2]
__author__ = 'Stuart Bishop <stuart@stuartbishop.net>'

debug = 0

import unittest
import zipfile
import sys
import os
import os.path
import pyRXP
try:
    import uRXP
except ImportError:
    # uRXP is optional here.  Only a failed import is tolerated; any other
    # error raised while loading the module is a real problem and must not
    # be hidden behind uRXP = None.
    uRXP = None
import codecs
if debug: import time

# 2.2 compatibility - sort of
try:
    __file__
except NameError:
    __file__ = os.path.join(os.getcwd(),'oops')

class test_uRXP_XMLTestSuite(unittest.TestCase):
    '''Run the parser module named by ``mod`` over the xmltest.zip test cases.

    Subclasses re-run the same tests against another parser module by
    overriding ``mod``.
    '''
    # NOTE(review): uRXP is None when the guarded import at the top of this
    # file failed; every test in this class then errors out in parse().
    mod = uRXP

    def setUp(self):
        '''Unpack xmltest.zip next to this file and classify its members.

        Fills self.valid with (input, expected-output) path pairs and
        self.invalid / self.notwf with input paths.
        '''
        self.valid = []
        self.invalid = []
        self.notwf = []
        self.testdir = os.path.dirname(__file__)
        zipf = zipfile.ZipFile(os.path.join(self.testdir,'xmltest.zip'))
        try:
            for zipname in zipf.namelist():
                osname = os.path.join(*zipname.split('/')) # For non-unixes
                osname = os.path.join(self.testdir,osname)
                dirname = os.path.dirname(osname)
                if not os.path.isdir(dirname):
                    os.makedirs(dirname)
                # Only extract files that are not already on disk.
                if not os.path.isfile(osname):
                    f = open(osname,'wb')
                    try:
                        f.write(zipf.read(zipname))
                    finally:
                        f.close()

                # Names containing 'out' are expected results, not inputs.
                # 'invalid' is tested before 'valid' because the former
                # contains the latter as a substring.
                # NOTE(review): members are not filtered by extension, so any
                # non-document file in these directories is queued as well -
                # confirm that is intended.
                if zipname.find('out') == -1:
                    if zipname.find('invalid') != -1:
                        self.invalid.append(osname)
                    elif zipname.find('not-wf') != -1:
                        self.notwf.append(osname)
                    elif zipname.find('valid') != -1:
                        outname = os.path.join(
                            dirname,'out',os.path.basename(osname)
                            )
                        self.valid.append( (osname,outname) )
        finally:
            # The archive handle used to be left open after every setUp().
            zipf.close()

    def parse(self,filename,**kw):
        '''Parse the named file with self.mod.Parser(**kw); return its result.

        ReturnComments and ExpandEmpty are always switched on.  The working
        directory is changed to the file's directory for the duration of the
        parse and restored afterwards, even when parsing raises.
        '''
        if debug: print >> sys.stderr,'About to parse %s' % filename
        kw = kw.copy()
        kw['ReturnComments'] = 1
        kw['ExpandEmpty'] = 1
        #kw['ReturnProcessingInstructions'] = 1
        parser = self.mod.Parser(**kw)
        # Change directory in case we are loading entities from cwd
        retdir = os.getcwd()
        d,n = os.path.split(filename)
        os.chdir(d)
        try:
            # Read in binary mode: setUp() wrote the bytes with 'wb', and the
            # parser should receive them without text-mode translation.
            f = open(n,'rb')
            try:
                xml = f.read()
            finally:
                f.close()
            return parser.parse(xml)
        finally:
            os.chdir(retdir)
            if debug: print >> sys.stderr,'Done parsing   %s' % filename
            if debug: print >> sys.stderr,'='*60
            if debug: time.sleep(1)

    def getcanonical(self,filename):
        ''' Parse in the named file, and return it as canonical XML '''
        return self._getcan(self.parse(filename))

    def _getcan(self,node):
        '''Serialise a parsed node (string or 4-tuple) recursively.'''
        if type(node) in (type(''),type(u'')):
            # Strings starting with '<?' or '<!' are passed through
            # unquoted; all other text is escaped.
            if node.startswith(u'<?') or node.startswith(u'<!'):
                return node
            else:
                return self._quote(node)

        tag,attrs,kids,junk = node

        if attrs is None:
            attrs = ''
        else:
            keys = attrs.keys()
            keys.sort() # Attributes in lexical order
            attrs = ' '.join(
                ['%s="%s"' % (k,self._quote(attrs[k])) for k in keys]
                )
            if attrs:
                attrs = ' ' + attrs

        text = ''.join([self._getcan(kid) for kid in kids])

        return '<%s%s>%s</%s>' % (tag,attrs,text,tag)

    def _quote(self,txt):
        '''Escape markup characters, tab, LF and CR in txt.'''
        # '&' must be replaced first so the entities added below are not
        # escaped a second time.
        txt = txt.replace('&','&amp;')
        txt = txt.replace('<','&lt;')
        txt = txt.replace('>','&gt;')
        txt = txt.replace('"','&quot;')
        txt = txt.replace('\x09','&#9;')
        txt = txt.replace('\x0a','&#10;')
        txt = txt.replace('\x0d','&#13;')
        return txt

    def test_valid(self):
        '''Each valid document must serialise to its expected output file.'''
        for inname,outname in self.valid:
            inxml = self.getcanonical(inname)
            f = codecs.open(outname,mode='r',encoding='utf8')
            try:
                outxml = f.read()
            finally:
                f.close()
            self.assertEqual(inxml,outxml)

    def test_invalid_parse(self):
        '''Invalid documents must still parse with validation off.'''
        for inname in self.invalid:
            try:
                self.parse(inname,Validate=0)
            except self.mod.error:
                self.fail('Failed to parse %r in non-validating mode' % inname)

    def test_invalid_validate(self):
        '''Invalid documents must be rejected with validation on.'''
        for inname in self.invalid:
            try:
                self.parse(inname,Validate=1)
                self.fail('Failed to detect validity error in %r' % inname)
            except self.mod.error:
                pass

    def test_notwf(self):
        '''Documents that are not well-formed must always be rejected.'''
        for inname in self.notwf:
            try:
                self.parse(inname,Validate=0)
                self.fail(
                    'Failed to detect well-formedness error in %r' % inname
                    )
            except self.mod.error:
                pass

class test_pyRXP_XMLTestSuite(test_uRXP_XMLTestSuite):
    # Re-runs every inherited test against the pyRXP module instead of uRXP.
    mod = pyRXP

if __name__ == '__main__':
    unittest.main()

--Apple-Mail-4-397099313
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed

-- 
Stuart Bishop <zen@shangri-la.dropbear.id.au>
http://shangri-la.dropbear.id.au/

--Apple-Mail-4-397099313--