[reportlab-users] re: Unicode pyRXP and malloc problems in pyRXP
Stuart Bishop
reportlab-users@reportlab.com
Wed, 19 Feb 2003 10:22:30 +1100
--Apple-Mail-2-731357005
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
format=flowed
On Wednesday, February 19, 2003, at 01:20 AM, Robin Becker wrote:
> Stuart my patch throws up rather horribly at the cvs diff patch. Can
> you
> send me the files themselves?
>
> That is pyRXP.c, setup.py, benchmarks.py & xmlparser.h.
--Apple-Mail-2-731357005
Content-Disposition: attachment;
filename=setup.py
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
x-unix-mode=0644;
name="setup.py"
#!/usr/bin/env python
#copyright ReportLab Inc. 2000-2002
#see license.txt for license details
#history http://cvs.sourceforge.net/cgi-bin/cvsweb.cgi/rl_addons/pyRXP/setup.py?cvsroot=reportlab
#$Header: /cvsroot/reportlab/rl_addons/pyRXP/setup.py,v 1.6 2002/05/03 10:20:22 rgbecker Exp $
if __name__=='__main__': #NO RUNTESTS
    # Build script for two extension modules sharing the RXP C sources:
    #   pyRXP - compiled with CHAR_SIZE=8
    #   uRXP  - compiled with CHAR_SIZE=16 from a private copy of the sources
    import os
    import sys
    import shutil
    from distutils.core import setup, Extension

    # Patch distutils if it can't cope with the "classifiers" keyword.
    # Compare version tuples rather than strings: string comparison is
    # lexicographic and does not order multi-digit components correctly.
    if sys.version_info < (2, 2, 3):
        from distutils.dist import DistributionMetadata
        DistributionMetadata.classifiers = None

    def raiseConfigError(msg):
        """Raise a ConfigError whose message is msg."""
        class ConfigError(Exception):
            pass
        raise ConfigError(msg)

    def readLicense():
        """Return the text of rxp/COPYING, closing the file afterwards."""
        f = open(os.path.join('rxp', 'COPYING'))
        try:
            return f.read()
        finally:
            f.close()

    # We copy the rxp source - we need to build it a second time for uRXP
    # with different compile time flags
    if os.path.exists('_uRXP'):
        shutil.rmtree('_uRXP')
    os.makedirs('_uRXP')

    RXPDIR = 'rxp'
    uRXPDIR = '_uRXP'
    RXPLIBSOURCES = []
    uRXPLIBSOURCES = []
    for f in ('xmlparser.c', 'url.c', 'charset.c', 'string16.c', 'ctype16.c',
              'dtd.c', 'input.c', 'stdio16.c', 'system.c', 'hash.c',
              'version.c', 'namespaces.c', 'http.c'):
        RXP_file = os.path.join(RXPDIR, f)
        uRXP_file = os.path.join(uRXPDIR, f)
        RXPLIBSOURCES.append(RXP_file)
        shutil.copy2(RXP_file, uRXP_file)
        uRXPLIBSOURCES.append(uRXP_file)

    # The wrapper itself is copied under a different name for the 16 bit build
    uRXP_c = os.path.join(uRXPDIR, 'uRXP.c')
    shutil.copy2('pyRXP.c', uRXP_c)
    uRXPLIBSOURCES.append(uRXP_c)

    # Extra link libraries needed per platform
    if sys.platform=="win32":
        LIBS = ['wsock32']
    elif sys.platform=="sunos5":
        LIBS = ['nsl', 'socket', 'dl']
    elif sys.platform=="aix4":
        LIBS = ['nsl_r', 'dl']
    elif sys.platform in ("freebsd4", "darwin", "mac", "linux2"):
        LIBS = []
    else:
        msg = "Don't know about system %s" % sys.platform
        if int(os.environ.get('LIBERROR', 1)):
            raiseConfigError(
                msg+'\nset environment LIBERROR=0 to try no extra libs'
                )
        else:
            # print(x) with a single argument behaves the same on Python 2 and 3
            print(msg)
            LIBS = []

    setup(  name = "pyRXP",
            version = "0.9.1",
            description = "Python RXP interface - fast validating XML parser",
            author = "Robin Becker",
            author_email = "robin@reportlab.com",
            url = "http://www.reportlab.com",
            packages = [],
            ext_modules = [
                # NOTE(review): pyRXP is built from 'oldRXP.c' while uRXP is
                # built from a copy of 'pyRXP.c' - confirm oldRXP.c is shipped.
                Extension(  'pyRXP',
                            ['oldRXP.c']+RXPLIBSOURCES,
                            include_dirs=[RXPDIR],
                            define_macros=[
                                ('CHAR_SIZE', 8),
                                ],
                            library_dirs=[],
                            # libraries to link against
                            libraries=LIBS,
                            ),
                Extension(  'uRXP',
                            uRXPLIBSOURCES,
                            include_dirs=[RXPDIR],
                            define_macros=[
                                ('CHAR_SIZE', 16),
                                ],
                            library_dirs=[],
                            # libraries to link against
                            libraries=LIBS,
                            ),
                ],
            license = readLicense(),
            classifiers = [
                'Development Status :: 5 - Production/Stable',
                'Intended Audience :: Developers',
                'License :: OSI Approved :: GNU General Public License (GPL)',
                'Programming Language :: Python',
                'Programming Language :: C',
                'Operating System :: Unix',
                'Operating System :: POSIX',
                'Operating System :: Microsoft :: Windows',
                'Topic :: Software Development :: Libraries :: Python Modules',
                'Topic :: Text Processing :: Markup :: XML',
                ]
            )

    #if os.path.exists('_uRXP'):
    #   shutil.rmtree('_uRXP')

    if sys.platform=='win32' and ('install' in sys.argv
            or 'install_ext' in sys.argv):
        def MovePYDs(*F):
            """Move the named .pyd files from the install location into DLLs.

            Does nothing when the user passed --install-platlib=... on the
            command line.
            """
            for x in sys.argv:
                if x[:18]=='--install-platlib=': return
            src = sys.exec_prefix
            dst = os.path.join(src, 'DLLs')
            if sys.hexversion>=0x20200a0:
                src = os.path.join(src, 'Lib', 'site-packages')
            for f in F:
                dstf = os.path.join(dst, f)
                # os.rename will not overwrite an existing file on win32
                if os.path.isfile(dstf):
                    os.remove(dstf)
                srcf = os.path.join(src, f)
                os.rename(srcf, dstf)
                print('Renaming %s to %s' % (srcf, dstf))
        MovePYDs('pyRXP.pyd',)
        MovePYDs('uRXP.pyd',)
--Apple-Mail-2-731357005
Content-Disposition: attachment;
filename=pyRXP.c
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
x-unix-mode=0664;
name="pyRXP.c"
/****************************************************************************
#copyright ReportLab Inc. 2000
#see license.txt for license details
#history http://cvs.sourceforge.net/cgi-bin/cvsweb.cgi/rl_addons/pyRXP/pyRXP.c?cvsroot=reportlab
#$Header: /cvsroot/reportlab/rl_addons/pyRXP/pyRXP.c,v 1.10 2002/10/25 15:27:40 rgbecker Exp $
****************************************************************************/
static char* __version__=" $Id: pyRXP.c,v 1.10 2002/10/25 15:27:40 rgbecker Exp $ ";
#include <Python.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#ifndef CHAR_SIZE
#error CHAR_SIZE not specified
#endif
#if CHAR_SIZE == 16
#define MODULE "uRXP"
#elif CHAR_SIZE == 8
#define PYSTRING(s) PyString_FromString(s)
#define MODULE "pyRXP"
#else
#error Invalid CHAR_SIZE specified
#endif
#include "system.h"
#include "ctype16.h"
#include "charset.h"
#include "string16.h"
#include "dtd.h"
#include "input.h"
#include "xmlparser.h"
#include "stdio16.h"
#include "version.h"
#include "namespaces.h"
#define VERSION "0.9.1"
#define MAX_DEPTH 256
#if CHAR_SIZE == 16
PyObject* PYSTRING(const Char* s) {
    /* 16 bit build only: turn a NUL terminated Char string into a Python
       unicode object by handing its bytes (2 per Char) to the "utf16"
       codec. Returns whatever PyUnicode_Decode returns. */
    const int nChars = (int) Strlen( s );
    return PyUnicode_Decode( (const char*) s, nChars*2, "utf16", NULL);
}
#endif
static PyObject *moduleError;
static PyObject *moduleVersion;
static PyObject *RXPVersion;
static PyObject *parser_flags;
static char *moduleDoc =
"\n\
This is pyRXP a python wrapper for RXP, a validating namespace-aware XML parser\n\
in C.\n\
\n\
RXP was written by Richard Tobin at the Language Technology Group,\n\
Human Communication Research Centre, University of Edinburgh.\n\
\n\
RXP is distributed under the GNU Public Licence, which is in the file\n\
COPYING. RXP may be made available under other licensing terms;\n\
contact M.Moens@ed.ac.uk for details.\n\
\n\
RXP is based on the W3C XML 1.0 recommendation of 10th February 1998\n\
and the Namespaces recommendation of 14th January 1999. Deviations\n\
from these recommendations should probably be considered as bugs.\n\
\n\
Interface summary:\n\
\n\
The python module exports the following\n\
error a python exception\n\
version the string version of the module\n\
RXPVersion the version string of the rxp library\n\
embedded in the module\n\
parser_flags a dictionary of parser flags\n\
the values are the defaults for parsers\n\
\n\
\n\
Parser(*kw) Create a parser\n\
\n\
\n\
Parser Attributes and Methods\n\
parse(src,**kw)\n\
The main interface to the parser. It returns Aaron Watter's\n\
radxml encoding of the xml src.\n\
The string src contains the xml.\n\
The keyword arguments can modify the instance attributes\n\
for this call only.\n\
The __call__ attribute of Parser instances is equivalent to\n\
the parse attribute.\n\
\n\
srcName '[unknown]', name used to refer to the parser src\n\
in error and warning messages.\n\
\n\
warnCB 0, should either be None, 0, or a\n\
callable method with a single argument which will\n\
receive warning messages. If None is used then warnings\n\
are thrown away. If the default 0 value is used then\n\
warnings are written to the internal error message buffer\n\
and will only be seen if an error occurs.\n\
\n\
eoCB argument should be None or a callable method with\n\
a single argument. This method will be called when external\n\
entities are opened. The method should return a possibly\n\
modified URI.\n\
\n"
" fourth argument should be None (default) or a callable method with\n\
no arguments. If callable, will be called to get or generate the\n\
4th item of every 4-item tuple or list in the returned tree\n\
\n\
Flag attributes corresponding to the rxp flags;\n\
the values are the module standard defaults.\n\
ExpandCharacterEntities = 1\n\
ExpandGeneralEntities = 1\n\
If these are set, entity references are expanded. If not, the\n\
references are treated as text, in which case any text returned that\n\
starts with an ampersand must be an entity reference (and provided\n\
MergePCData is off, all entity references will be returned as separate\n\
pieces).\n\
XMLSyntax = 1\n\
XMLPredefinedEntities = 1\n\
ErrorOnUnquotedAttributeValues = 1\n\
NormaliseAttributeValues = 1\n\
If this is set, attributes are normalised according to the standard.\n\
You might want to not normalise if you are writing something like an\n\
editor.\n\
ErrorOnBadCharacterEntities = 1\n\
If this is set, character entities which expand to illegal values are\n\
an error, otherwise they are ignored with a warning.\n\
ErrorOnUndefinedEntities = 1\n\
If this is set, undefined general entity references are an error,\n\
otherwise a warning is given and a fake entity constructed whose value\n\
looks the same as the entity reference.\n\
ReturnComments = 0\n\
If this is set, comments are returned, otherwise they are ignored.\n\
ReturnProcessingInstructions = 0\n\
If this is set, processing instructions are returned, otherwise\n\
they are ignored.\n\
CaseInsensitive = 0\n\
ErrorOnUndefinedElements = 0\n\
ErrorOnUndefinedAttributes = 0\n\
If these are set and there is a DTD, references to undeclared\n\
elements and attributes are an error.\n\
WarnOnRedefinitions = 0\n\
If this is on, a warning is given for redeclared elements, attributes,\n\
entities and notations.\n"
" TrustSDD = 1\n\
ProcessDTD = 0\n\
If TrustSDD is set and a DOCTYPE declaration is present, the internal\n\
part is processed and if the document was not declared standalone or\n\
if Validate is set the external part is processed. Otherwise, whether\n\
the DOCTYPE is automatically processed depends on ProcessDTD; if\n\
ProcessDTD is not set the user must call ParseDtd() if desired.\n\
XMLExternalIDs = 1\n\
ReturnDefaultedAttributes = 1\n\
If this is set, the returned attributes will include ones defaulted as\n\
a result of ATTLIST declarations, otherwise missing attributes will not\n\
be returned.\n\
MergePCData = 1\n\
If this is set, text data will be merged across comments and entity\n\
references.\n\
XMLMiscWFErrors = 1\n\
XMLStrictWFErrors = 1\n\
If this is set, various well-formedness errors will be reported as errors\n\
rather than warnings.\n\
AllowMultipleElements = 0\n\
MaintainElementStack = 1\n\
IgnoreEntities = 0\n\
XMLLessThan = 0\n\
IgnorePlacementErrors = 0\n"
" Validate = 1\n\
If this is on, the parser will validate the document.\n\
ErrorOnValidityErrors = 1\n\
If this is on, validity errors will be reported as errors rather than\n\
warnings. This is useful if your program wants to rely on the\n\
validity of its input.\n\
XMLSpace = 0\n\
If this is on, the parser will keep track of xml:space attributes\n\
XMLNamespaces = 0\n\
If this is on, the parser processes namespace declarations (see\n\
below). Namespace declarations are *not* returned as part of the list\n\
of attributes on an element.\n\
NoNoDTDWarning = 1\n\
Usually, if Validate is set, the parser will produce a warning if the\n\
document has no DTD. This flag suppresses the warning (useful if you\n\
want to validate if possible, but not complain if not).\n\
SimpleErrorFormat = 0\n\
AllowUndeclaredNSAttributes = 0\n\
RelaxedAny = 0\n\
ReturnNamespaceAttributes = 0\n\
ReturnList = 0\n\
Usually we discard comments and want only one tag; set this to 1 to get\n\
a list at the top level instead of a supposed singleton tag.\n\
If 0 the first tuple in the list will be returned (ie the first tag tuple).\n\
ExpandEmpty false (default) or true. If false, empty attribute dicts and\n\
empty lists of children are changed into the value None\n\
in every 4-item tuple or list in the returned tree\n\
MakeMutableTree false (default) or true. If false, nodes in the returned tree\n\
are 4-item tuples; if true, 4-item lists.\n\
";
/*alter the integer values to change the module defaults
*
* Table of flag names and their default values. The position of an entry
* is significant: the attribute code in this file casts the table index
* directly to a ParserFlag, so the order must not be changed. The final
* three names (ReturnList, ExpandEmpty, MakeMutableTree) are the module's
* own pseudo flags which follow ProcessDTD. The {0} entry ends the table;
* loops stop when they find a null k.
*/
static struct {char* k;long v;} flag_vals[]={
{"ExpandCharacterEntities",1},
{"ExpandGeneralEntities",1},
{"XMLSyntax",1},
{"XMLPredefinedEntities",1},
{"ErrorOnUnquotedAttributeValues",1},
{"NormaliseAttributeValues",1},
{"ErrorOnBadCharacterEntities",1},
{"ErrorOnUndefinedEntities",1},
{"ReturnComments",0},
{"ReturnProcessingInstructions",0},
{"CaseInsensitive",0},
{"ErrorOnUndefinedElements",0},
{"ErrorOnUndefinedAttributes",0},
{"WarnOnRedefinitions",0},
{"TrustSDD",1},
{"XMLExternalIDs",1},
{"ReturnDefaultedAttributes",1},
{"MergePCData",1},
{"XMLMiscWFErrors",1},
{"XMLStrictWFErrors",1},
{"AllowMultipleElements",0},
{"MaintainElementStack",1},
{"IgnoreEntities",0},
{"XMLLessThan",0},
{"IgnorePlacementErrors",0},
{"Validate",1},
{"ErrorOnValidityErrors",1},
{"XMLSpace",0},
{"XMLNamespaces",0},
{"NoNoDTDWarning",1},
{"SimpleErrorFormat",0},
{"AllowUndeclaredNSAttributes",0},
{"RelaxedAny",0},
{"ReturnNamespaceAttributes",0},
{"ProcessDTD",0},
{"ReturnList",0},
{"ExpandEmpty",0},
{"MakeMutableTree",0},
{0}};
/* Module specific pseudo flags are numbered immediately after the last
   flag taken from RXP (ProcessDTD). */
#define LASTRXPFLAG ProcessDTD
#define ReturnList (ParserFlag)(1+(int)LASTRXPFLAG)
#define ExpandEmpty (ParserFlag)(1+(int)ReturnList)
#define MakeMutableTree (ParserFlag)(1+(int)ExpandEmpty)
/* Test one flag bit: evaluates to 1 if set, 0 otherwise. Flags below 32
   live in (p)->flags[0], the rest in (p)->flags[1]; p may be anything
   with a two element flags member. */
#define __GetFlag(p, flag) \
((((flag) < 32) ? ((p)->flags[0] & (1u << (flag))) : ((p)->flags[1] & (1u << ((flag)-32))))!=0)
/* Debug builds only: reference count accessor */
#ifdef _DEBUG
# define Py_REFCOUNT(op) ((op)->ob_refcnt)
#endif
/* Per parse state handed to the RXP callbacks through the parser's
   callback argument. */
typedef struct {
Parser p; /* the RXP parser this state belongs to */
int warnCBF; /* count of warnCB calls that raised (see myWarnCB) */
int warnErr; /* count of warnings received (see myWarnCB) */
PyObject* warnCB; /* python warning callback */
PyObject* eoCB; /* python entity open callback */
PyObject* fourth; /* callable producing item 3 of each node, or None/NULL */
PyObject* (*Node_New)(int); /* PyList_New or PyTuple_New */
int (*SetItem)(PyObject*, int, PyObject*); /* matching SetItem function */
PyObject* (*GetItem)(PyObject*, int); /* matching GetItem function */
int none_on_empty; /* true: use None for empty attrs/children */
} ParserDetails;
/* Shorthands for the node functions; they expect a local variable
   named pd of type ParserDetails* to be in scope. */
#define PDGetItem pd->GetItem
#define PDSetItem pd->SetItem
#define PDNode_New pd->Node_New
/* Build the attribute object for an element.
*
* pd   parse state; only none_on_empty is consulted
* e    the element definition (not used)
* a    head of the element's attribute list, may be null
*
* Returns a new reference: Py_None when there are no attributes and
* pd->none_on_empty is set, otherwise a dict mapping attribute names to
* values. Returns NULL if the dict cannot be allocated. An attribute
* whose name or value cannot be converted to a Python string is left
* out of the dict (the Python error indicator stays set) rather than
* crashing on a NULL object.
*/
static PyObject* get_attrs(ParserDetails* pd, ElementDefinition e, Attribute a)
{
    PyObject *attrs, *key, *val;

    (void)e;
    if(pd->none_on_empty && !a){
        Py_INCREF(Py_None);
        return Py_None;
    }

    attrs = PyDict_New();
    if(!attrs) return NULL;

    for(; a; a=a->next){
        key = PYSTRING( (Char*)a->definition->name );
        val = PYSTRING( (Char*)a->value );
        /* PYSTRING can fail; never hand NULL to the dict or to DECREF */
        if(key && val) PyDict_SetItem(attrs, key, val);
        Py_XDECREF(key);
        Py_XDECREF(val);
    }
    return attrs;
}
/* Create a 4 item node (name, attrs, children, fourth) using the node
* functions selected in pd.
*
* name   element name, converted with PYSTRING
* attr   attribute object stored as item 1; the reference is handed over
* empty  true for an empty element
*
* Item 2 is None when empty && pd->none_on_empty, otherwise a new empty
* list. Item 3 is the result of calling pd->fourth when that is set and
* is not None, otherwise None.
*/
static PyObject* makeNode(ParserDetails* pd, const Char *name, PyObject* attr, int empty)
{
    PyObject *node = PDNode_New(4);
    PyObject *children;
    PyObject *extra;

    PDSetItem(node, 0, PYSTRING(name));
    PDSetItem(node, 1, attr);

    if(!(empty && pd->none_on_empty))
        children = PyList_New(0);
    else {
        Py_INCREF(Py_None);
        children = Py_None;
    }
    PDSetItem(node, 2, children);

    if(!pd->fourth || pd->fourth==Py_None){
        Py_INCREF(Py_None);
        extra = Py_None;
    }
    else
        extra = PyObject_CallObject(pd->fourth, 0);
    PDSetItem(node, 3, extra);

    return node;
}
#if CHAR_SIZE == 16
Char* com_head;
Char* com_tail;
#endif
/* Process one parser event and update the stack of open element nodes.
*
* p      the parser; its callback_arg is the ParserDetails for this parse
* bit    the event to handle
* stack  open nodes; stack[0] is the root, stack[*depth] the innermost.
*        The array is assumed to hold MAX_DEPTH entries.
* depth  index of the innermost open node, updated on start/end events
*
* Returns 0 on success, 1 for a parse error or unknown event, 2 for an
* internal error (stack limit, stack underflow, out of memory).
*/
static int handle_bit(Parser p, XBit bit, PyObject *stack[],int *depth)
{
    int r = 0, empty;
    PyObject *t;
    ParserDetails* pd = (ParserDetails*)(p->callback_arg);

    switch(bit->type) {
        case XBIT_eof: break;
        case XBIT_error:
            ParserPerror(p, bit);
            r = 1;
            break;
        case XBIT_start:
        case XBIT_empty:
            empty = bit->type == XBIT_empty;
            /* Only a start tag pushes a node. The highest valid index is
               MAX_DEPTH-1, so refuse to push once that slot is in use;
               testing for ==MAX_DEPTH would allow a write one past the
               end of the stack. */
            if(!empty && *depth >= MAX_DEPTH-1){
                Fprintf(Stderr,"Internal error, stack limit reached!\n");
                r = 2;
                break;
            }
            t = makeNode( pd, bit->element_definition->name,
                    get_attrs(pd, bit->element_definition, bit->attributes), empty);
            if(empty){
                /* an empty element is complete: attach it to its parent */
                PyList_Append(PDGetItem(stack[*depth],2),t);
                Py_DECREF(t);
            }
            else {
                *depth = *depth + 1;
                stack[*depth] = t;
            }
            break;
        case XBIT_end:
            if(*depth==0){
                Fprintf(Stderr,"Internal error, stack underflow!\n");
                r = 2;
                break;
            }
            /* pop the finished node and attach it to its parent */
            t = stack[*depth];
            *depth = *depth-1;
            PyList_Append(PDGetItem(stack[*depth],2),t);
            Py_DECREF(t);
            break;
        case XBIT_pi:
            if(ParserGetFlag(p,ReturnProcessingInstructions)){
                /* room for "<?" name " " chars "?>" and the terminator */
                Char* z;
                Char* c = (Char*)PyMem_Malloc(
                        (Strlen(bit->pi_name) + Strlen(bit->pi_chars) + 6)*2
                        );
                if(!c){
                    Fprintf(Stderr,"Internal error, memory limit reached!\n");
                    r = 2;
                    break;
                }
                z = strdup_char8_to_Char("<?");
                Strcpy(c,z);
                free(z);
                Strcat(c,bit->pi_name);
                z = strdup_char8_to_Char(" ");
                Strcat(c,z);
                free(z);
                Strcat(c,bit->pi_chars);
                z = strdup_char8_to_Char("?>");
                Strcat(c,z);
                free(z);
                t = PYSTRING(c);
                PyList_Append(PDGetItem(stack[*depth],2),t);
                Py_DECREF(t);
                PyMem_Free(c);
            }
            break;
        case XBIT_pcdata:
            t = PYSTRING(bit->pcdata_chars);
            PyList_Append(PDGetItem(stack[*depth],2),t);
            Py_DECREF(t);
            break;
        case XBIT_cdsect:
            t = PYSTRING(bit->cdsect_chars);
            PyList_Append(PDGetItem(stack[*depth],2),t);
            Py_DECREF(t);
            break;
        case XBIT_dtd:
            break;
        case XBIT_comment:
            if(ParserGetFlag(p,ReturnComments)){
                /* room for "<!--" chars "-->" and the terminator */
#if CHAR_SIZE == 8
                char* c = (char*)PyMem_Malloc(strlen(bit->comment_chars)+8);
                if(!c){
                    Fprintf(Stderr,"Internal error, memory limit reached!\n");
                    r = 2;
                    break;
                }
                strcpy(c,"<!--");
                strcat(c,bit->comment_chars);
                strcat(c,"-->");
#elif CHAR_SIZE == 16
                Char* c = (Char*)PyMem_Malloc(Strlen(bit->comment_chars)*2+16);
                if(!c){
                    Fprintf(Stderr,"Internal error, memory limit reached!\n");
                    r = 2;
                    break;
                }
                Strcpy(c,com_head);
                Strcat(c,bit->comment_chars);
                Strcat(c,com_tail);
#endif
                t = PYSTRING(c);
                PyList_Append(PDGetItem(stack[*depth],2),t);
                Py_DECREF(t);
                PyMem_Free(c);
            }
            break;
        default:
            Fprintf(Stderr, "\nUnknown event type %s\n", XBitTypeName[bit->type]);
            ParserPerror(p, bit);
            r = 1;
            break;
    }
    return r;
}
/* Entity opener installed on the parser when an eoCB callback is set.
* For an external entity the python callback is called with the system
* id; if it returns a string that differs from the one passed in, the
* entity's systemid is replaced by a copy of the returned string. The
* entity is then opened with EntityOpen in every case. A failing
* callback is ignored (the python error is cleared).
* info is the ParserDetails for the current parse.
*/
static InputSource entity_open(Entity e, void *info)
{
ParserDetails* pd = (ParserDetails*)info;
PyObject *eoCB = pd->eoCB;
if(e->type==ET_external){
PyObject *arglist;
PyObject *result;
/* NOTE(review): arglist is not checked for NULL before use */
arglist = Py_BuildValue("(s)",e->systemid);/* NB. 8 bit */
result = PyEval_CallObject(eoCB, arglist);
if(result){
/* non string results are ignored */
if(PyString_Check(result)){
int i;
/* i receives the comparison result; non zero means different */
PyObject_Cmp(PyTuple_GET_ITEM(arglist,0),result,&i);
if(i){
/*not the same*/
Free((void*)(e->systemid));
e->systemid = strdup8(PyString_AS_STRING(result));
}
}
Py_DECREF(result);
}
else {
/* the callback raised: discard the error, keep the original id */
PyErr_Clear();
}
Py_DECREF(arglist);
}
return EntityOpen(e);
}
/* Raise moduleError using the text accumulated in RXP's Stderr stream.
*
* p    the parser; p->errbuf, when set, is appended to the stream first
* msg  a final message line appended to the stream
*
* Stderr is expected to be a string backed FILE16 (pyRXPParser_parse
* arranges this). The buffer is terminated at the current write position
* and used as the exception value: a plain string in the 8 bit build, a
* PYSTRING converted object in the 16 bit build.
*/
void PyErr_FromStderr(Parser p, char *msg){
    /* Yech. This appears to be pulling the error messages
       from the internals of RXP's Stderr. */
    struct _FILE16 {
        void *handle;
        int handle2, handle3;
    };
    struct _FILE16 *err = (struct _FILE16*)Stderr;
    /* all declarations come first so that C89 compilers accept the code */
#if CHAR_SIZE == 8
    char *buf = err->handle;
#else
    Char *buf = err->handle;
    PyObject *t;
#endif

    if(p->errbuf) Fprintf(Stderr,"%s\n", p->errbuf);
    Fprintf(Stderr,"%s\n", msg);

#if CHAR_SIZE == 8
    buf[err->handle2] = 0;
    PyErr_SetString(moduleError,buf);
#else
    /* handle2 is used as a byte offset, buf is indexed in Chars. A single
       zero Char terminates the string; writing a second one at index+1
       can fall outside the buffer when it is full. */
    buf[err->handle2 / 2] = 0;
    t = PYSTRING(buf);
    if(t){
        PyErr_SetObject(moduleError,t);
        Py_DECREF(t);
    }
    /* if the conversion failed its own exception is already set */
#endif
}
/* Parse source with p and build the python tree.
* Returns a new reference to the result, or NULL with a python exception
* set (via PyErr_FromStderr) on failure. When the ReturnList flag is off
* the result is the first tuple or list found among the top level items,
* or None if there is none; when it is on the result is the whole top
* level list.
*/
PyObject *ProcessSource(Parser p, InputSource source)
{
XBit bit=0;
int r, depth, i;
/* open element nodes; stack[0] is an artificial root collecting top level items */
PyObject *stack[MAX_DEPTH];
PyObject *retVal = 0;
ParserDetails* pd = (ParserDetails*)(p->callback_arg);
if(ParserPush(p, source) == -1) {
PyErr_FromStderr(p,"Internal error, ParserPush failed!");
return NULL;
}
depth = 0;
stack[0] = makeNode( pd,(const Char*)"", Py_None, 0); /*stealing a reference to Py_None*/
Py_INCREF(Py_None); /*so we must correct for it*/
/* feed events to handle_bit until it reports an error or eof is seen */
while(1){
XBitType bt;
bit = ReadXBit(p);
r = handle_bit(p, bit, stack, &depth);
/* remember the type: the bit is released before it is tested */
bt = bit->type;
FreeXBit(bit);
if(r) break;
if (bt == XBIT_eof){
r=0;
break;
}
}
if(!r && depth==0){
/* success: keep the root's child list and drop the root itself */
PyObject* l0 = PDGetItem(stack[0],2);
Py_INCREF(l0);
Py_DECREF(stack[0]);
if(!__GetFlag(p,ReturnList)){
/* return the first node (tuple or list) skipping other items */
int n = PyList_Size(l0);
for(i=0;i<n;i++){
retVal = PyList_GetItem(l0,i);
if(PyTuple_Check(retVal)) break;
if(PyList_Check(retVal)) break;
}
if(i==n) retVal = Py_None;
/* PyList_GetItem gave a borrowed reference; own it before l0 goes */
Py_INCREF(retVal);
Py_DECREF(l0);
}
else {
retVal = l0;
}
PyErr_Clear();
}
else {
/* failure: r is zero here only if elements were left open at eof */
if(!r) PyErr_FromStderr(p,"Internal error, stack not fully popped!");
else {
Fprintf(Stderr,"error return=%d\n",r);
PyErr_FromStderr(p,"Parse Failed!");
}
/* release every node still on the stack */
for(i=0;i<=depth;i++){
Py_DECREF(stack[i]);
}
retVal = NULL;
}
return retVal;
}
/* Warning callback installed on the parser when a python warnCB is set.
* Counts the warning, formats it with _ParserPerror into a local buffer
* and passes the text to the python callback as a single argument. If
* the callback raises, the failure is counted in warnCBF and the python
* error is cleared. info is the ParserDetails for the current parse.
*/
static void myWarnCB(XBit bit, void *info)
{
    ParserDetails* details = (ParserDetails*)info;
    char text[512];
    FILE16 *out;
    PyObject *cbArgs;
    PyObject *cbResult;

    details->warnErr++;
    if(details->warnCB==Py_None) return;

    out = MakeFILE16FromString(text,sizeof(text)-1,"w");
    _ParserPerror(out, details->p, bit);
    Fclose(out);

    /* TODO: This probably needs to be unicode as well */
    cbArgs = Py_BuildValue("(s)",text);
    cbResult = PyEval_CallObject(details->warnCB, cbArgs);
    Py_DECREF(cbArgs);

    if(!cbResult){
        details->warnCBF++;
        PyErr_Clear();
        return;
    }
    Py_DECREF(cbResult);
}
/* The python level Parser instance. */
typedef struct {
PyObject_HEAD
/* callbacks, source name used in messages, and the "fourth" callable;
   the callbacks may be NULL when never assigned */
PyObject *warnCB, *eoCB, *srcName, *fourth;
/* flag bits: flags 0-31 in flags[0], the rest in flags[1] */
int flags[2];
} pyRXPParserObject;
/* Set (value non zero) or clear (value zero) one flag bit of p.
   The flag number selects the word (flag/32) and the bit (flag%32). */
static void __SetFlag(pyRXPParserObject* p, ParserFlag flag, int value)
{
    const int word = (flag >> 5);
    const unsigned int mask = (1u << (flag & 31));

    if(!value){
        p->flags[word] &= ~mask;
        return;
    }
    p->flags[word] |= mask;
}
/* Store value in the callback slot *pCB.
* value must be None or callable, otherwise ValueError is raised using
* name in the message and -1 is returned. On success the old object in
* the slot is released, the slot takes its own reference to value and 0
* is returned.
*/
static int _set_CB(char* name, PyObject** pCB, PyObject* value)
{
    char msg[64];

    if(value==Py_None || PyCallable_Check(value)){
        Py_XDECREF(*pCB);
        *pCB = value;
        Py_INCREF(value);
        return 0;
    }

    sprintf(msg,"%s value must be absent, callable or None", name);
    PyErr_SetString(PyExc_ValueError, msg);
    return -1;
}
/* tp_setattr for Parser instances; also used to apply keyword arguments.
*
* name   attribute name: warnCB, eoCB, fourth, srcName or a flag name
*        from flag_vals
* value  new value; NULL when the attribute is being deleted
*
* Returns 0 on success. Returns -1 with ValueError set for a bad value
* and with AttributeError set for an unknown name. Error messages are
* built with PyErr_Format and a bounded %.200s so that an arbitrarily
* long attribute name cannot overrun a local buffer.
*/
static int pyRXPParser_setattr(pyRXPParserObject *self, char *name, PyObject* value)
{
    PyObject* v;
    int i;

    if(!strcmp(name,"warnCB")) return _set_CB(name,&self->warnCB,value);
    if(!strcmp(name,"eoCB")) return _set_CB(name,&self->eoCB,value);
    if(!strcmp(name,"fourth")) return _set_CB(name,&self->fourth,value);

    if(!strcmp(name,"srcName")){
        /* value is NULL for "del parser.srcName"; PyString_Check would
           dereference it */
        if(!value || !PyString_Check(value)){
            PyErr_SetString(PyExc_ValueError, "srcName value must be a string");
            return -1;
        }
        /* take the new reference before dropping the old one */
        Py_INCREF(value);
        Py_XDECREF(self->srcName);
        self->srcName = value;
        return 0;
    }

    /* the index of a name in flag_vals is its ParserFlag number */
    for(i=0;flag_vals[i].k;i++){
        if(strcmp(flag_vals[i].k,name)) continue;
        v = value ? PyNumber_Int(value) : NULL;
        if(!v){
            PyErr_Format(PyExc_ValueError, "%.200s value must be int", name);
            return -1;
        }
        __SetFlag(self,(ParserFlag)i,PyInt_AsLong(v));
        Py_DECREF(v);
        return 0;
    }

    PyErr_Format(PyExc_AttributeError, "Unknown attribute %.200s", name);
    return -1;
}
/* parse(src,**kw), also used as the instance's call slot.
* Parses the string src and returns the result of ProcessSource, or NULL
* with an exception set. Keyword arguments are applied with
* pyRXPParser_setattr to a local copy of the instance, so they affect
* this call only.
*/
static PyObject* pyRXPParser_parse(pyRXPParserObject* xself, PyObject* args, PyObject* kw)
{
int srcLen, i;
char *src;
FILE16 *f;
InputSource source;
PyObject *retVal=NULL;
/* receives everything RXP writes to Stderr during this parse */
char errBuf[512];
ParserDetails CB;
Parser p;
/* work on a copy so that keyword overrides never touch xself */
pyRXPParserObject dummy = *xself;
pyRXPParserObject* self = &dummy;
/* the copy holds its own references; they are released at L_1 */
if(self->warnCB) Py_INCREF(self->warnCB);
if(self->eoCB) Py_INCREF(self->eoCB);
if(self->fourth) Py_INCREF(self->fourth);
if(self->srcName) Py_INCREF(self->srcName);
if(!PyArg_ParseTuple(args, "s#", &src, &srcLen)) goto L_1;
if(kw){
PyObject *key, *value;
i = 0;
/* NOTE(review): the result of PyString_AsString(key) is not checked */
while(PyDict_Next(kw,&i,&key,&value))
if(pyRXPParser_setattr(self, PyString_AsString(key), value)) goto L_1;
}
/* NOTE(review): CB.warnCB, CB.warnErr, CB.warnCBF and CB.eoCB stay
   unset when the corresponding callback is NULL; the callbacks that read
   them are only installed below when the callback is set and not None */
if(self->warnCB){
CB.warnCB = self->warnCB;
CB.warnErr = 0;
CB.warnCBF = 0;
}
if(self->eoCB){
CB.eoCB = self->eoCB;
}
CB.fourth = self->fourth;
p = NewParser();
CB.p = p;
/* CB reaches the callbacks and handle_bit through the parser */
ParserSetCallbackArg(p, &CB);
p->flags[0] = self->flags[0];
p->flags[1] = self->flags[1];
if((self->warnCB && self->warnCB!=Py_None) || (self->eoCB && self->eoCB!=Py_None)){
if(self->warnCB && self->warnCB!=Py_None) ParserSetWarningCallback(p, myWarnCB);
if(self->eoCB && self->eoCB!=Py_None) ParserSetEntityOpener(p, entity_open);
}
CB.none_on_empty = !__GetFlag(self,ExpandEmpty);
/* choose list or tuple nodes for the returned tree */
if(__GetFlag(self,MakeMutableTree)){
CB.Node_New = PyList_New;
CB.SetItem = PyList_SetItem;
CB.GetItem = PyList_GetItem;
}
else {
CB.Node_New = PyTuple_New;
CB.SetItem = PyTuple_SetItem;
CB.GetItem = PyTuple_GetItem;
}
ParserSetFlag(p,XMLPredefinedEntities,__GetFlag(self,XMLPredefinedEntities));
/*set up the parsers Stderr stream thing so we get it in a string*/
Fclose(Stderr);
Stderr = MakeFILE16FromString(errBuf,sizeof(errBuf)-1,"w");
f = MakeFILE16FromString(src,srcLen,"r");
source = SourceFromFILE16(PyString_AsString(self->srcName),f);
retVal = ProcessSource(p,source);
/* tear down in this order; NOTE(review): Stderr is left pointing at the
   closed stream and is closed again on the next call - confirm that
   NewParser/deinit_parser reset it */
FreeEntity(source->entity);
Fclose(Stderr);
FreeDtd(p->dtd);
FreeParser(p);
deinit_parser();
L_1:
/* drop the references owned by the local copy */
Py_XDECREF(self->warnCB);
Py_XDECREF(self->eoCB);
Py_XDECREF(self->fourth);
Py_XDECREF(self->srcName);
return retVal;
}
/* Methods of Parser instances, looked up by pyRXPParser_getattr. */
static struct PyMethodDef pyRXPParser_methods[] = {
{"parse", (PyCFunction)pyRXPParser_parse, METH_VARARGS|METH_KEYWORDS, "parse(src,**kw)"},
{NULL, NULL} /* sentinel */
};
/* Return a new reference to ob, or raise AttributeError naming name
   and return NULL when ob is NULL (the attribute was never set). */
static PyObject* _get_OB(char* name,PyObject* ob)
{
    char msg[128];

    if(!ob){
        sprintf(msg,"Unknown attribute %s", name);
        PyErr_SetString(PyExc_AttributeError, msg);
        return NULL;
    }
    Py_INCREF(ob);
    return ob;
}
/* tp_getattr for Parser instances.
* Resolves, in order: the callback attributes (warnCB, eoCB, fourth),
* srcName, the flag names in flag_vals (returned as an int 0 or 1) and
* finally the methods in pyRXPParser_methods.
*/
static PyObject* pyRXPParser_getattr(pyRXPParserObject *self, char *name)
{
    int idx;

    if(strcmp(name,"warnCB")==0) return _get_OB(name,self->warnCB);
    if(strcmp(name,"eoCB")==0) return _get_OB(name,self->eoCB);
    if(strcmp(name,"fourth")==0) return _get_OB(name,self->fourth);
    if(strcmp(name,"srcName")==0){
        Py_INCREF(self->srcName);
        return self->srcName;
    }

    /* the index of a name in flag_vals is its ParserFlag number */
    for(idx=0; flag_vals[idx].k; idx++){
        if(strcmp(flag_vals[idx].k,name)==0)
            return PyInt_FromLong(__GetFlag(self,(ParserFlag)idx));
    }

    return Py_FindMethod(pyRXPParser_methods, (PyObject *)self, name);
}
/* tp_dealloc for Parser instances: release the owned references and
* then the object itself.
*
* The object is created with PyObject_NEW, so it must be released with
* the matching object allocator call (PyObject_DEL) and not with the raw
* memory call PyMem_DEL; mixing the two families corrupts the heap when
* Python is built with its specialised object allocator.
*/
static void pyRXPParserFree(pyRXPParserObject* self)
{
    Py_XDECREF(self->srcName);
    Py_XDECREF(self->warnCB);
    Py_XDECREF(self->eoCB);
    Py_XDECREF(self->fourth);
#if 0
    /*this could be called if we're never going to use the parser again*/
    deinit_parser();
#endif
    PyObject_DEL(self);
}
/* Type object for Parser instances. The ob_type field is left as 0 here
   and filled in by the module init function. Attribute access goes
   through the getattr/setattr functions and calling an instance is the
   same as calling its parse method. */
static PyTypeObject pyRXPParserType = {
PyObject_HEAD_INIT(0)
0, /*ob_size*/
"pyRXPParser", /*tp_name*/
sizeof(pyRXPParserObject), /*tp_basicsize*/
0, /*tp_itemsize*/
/* methods */
(destructor)pyRXPParserFree, /*tp_dealloc*/
(printfunc)0, /*tp_print*/
(getattrfunc)pyRXPParser_getattr, /*tp_getattr*/
(setattrfunc)pyRXPParser_setattr, /*tp_setattr*/
(cmpfunc)0, /*tp_compare*/
(reprfunc)0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
(hashfunc)0, /*tp_hash*/
(ternaryfunc)pyRXPParser_parse, /*tp_call*/
(reprfunc)0, /*tp_str*/
/* Space for future expansion */
0L,0L,0L,0L,
/* Documentation string */
"pyRXPParser instance, see pyRXP doc string for details."
};
/* Module level Parser(**kw): create a parser instance.
*
* No positional arguments are accepted. The instance starts with
* srcName "[unknown]", no callbacks, and each flag set from the module's
* parser_flags dictionary (falling back to the built in default when a
* name has been removed from it). Keyword arguments are then applied
* with pyRXPParser_setattr.
*
* Returns the new instance, or NULL with an exception set.
*/
static pyRXPParserObject* pyRXPParser(PyObject* module, PyObject* args, PyObject* kw)
{
    pyRXPParserObject* self;
    int i;

    if(!PyArg_ParseTuple(args, ":Parser")) return NULL;
    if(!(self = PyObject_NEW(pyRXPParserObject, &pyRXPParserType))) return NULL;

    /* plain assignments: a cast is not an lvalue in standard C */
    self->warnCB = NULL;
    self->eoCB = NULL;
    self->fourth = NULL;
    self->srcName = NULL;
    /* __SetFlag only ors/ands single bits, so start from a known state */
    self->flags[0] = 0;
    self->flags[1] = 0;

    if(!(self->srcName=PyString_FromString("[unknown]"))){
        PyErr_SetString(moduleError,"Internal error, memory limit reached!");
        goto Lfree;
    }

    for(i=0;flag_vals[i].k;i++){
        /* parser_flags is visible to python code and may have been edited */
        PyObject *dflt = PyDict_GetItemString(parser_flags,flag_vals[i].k);
        long v = dflt ? PyInt_AsLong(dflt) : flag_vals[i].v;
        if(v==-1 && PyErr_Occurred()) goto Lfree;
        __SetFlag(self,(ParserFlag)i,v);
    }

    if(kw){
        PyObject *key, *value;
        char *k;
        i = 0;
        while(PyDict_Next(kw,&i,&key,&value)){
            /* NULL here means key is not a string; the error is already set */
            k = PyString_AsString(key);
            if(!k) goto Lfree;
            if(pyRXPParser_setattr(self, k, value)) goto Lfree;
        }
    }
    return self;

Lfree:
    pyRXPParserFree(self);
    return NULL;
}
/* Functions exported by the module; Parser accepts keyword arguments. */
static struct PyMethodDef moduleMethods[] = {
{"Parser", (PyCFunction)pyRXPParser,
METH_VARARGS|METH_KEYWORDS,
"Parser(*kw) create a pyRXP parser instance"
},
{NULL, NULL} /*sentinel*/
};
/* Module initialisation. The function is named after the module being
   built: inituRXP for the 16 bit build, initpyRXP for the 8 bit build.
   It publishes version, RXPVersion, error, parser_flags and __doc__ in
   the module dictionary. */
#if CHAR_SIZE == 16
DL_EXPORT(void) inituRXP(void)
#elif CHAR_SIZE == 8
DL_EXPORT(void) initpyRXP(void)
#endif
{
PyObject *m, *d, *v, *t;
int i;
/*set up the types by hand*/
pyRXPParserType.ob_type = &PyType_Type;
/* Create the module and add the functions */
m = Py_InitModule(MODULE, moduleMethods);
/* Add some symbolic constants to the module */
d = PyModule_GetDict(m);
moduleVersion = PyString_FromString(VERSION);
PyDict_SetItemString(d, "version", moduleVersion );
RXPVersion = PyString_FromString(rxp_version_string);
PyDict_SetItemString(d, "RXPVersion", RXPVersion );
/* the exception is named "<module>.Error" but exported as "error" */
moduleError = PyErr_NewException(MODULE ".Error",NULL,NULL);
PyDict_SetItemString(d,"error",moduleError);
/* parser_flags maps each flag name to its default from flag_vals */
parser_flags = PyDict_New();
for(i=0;flag_vals[i].k;i++){
PyDict_SetItemString(parser_flags, flag_vals[i].k,
t=PyInt_FromLong(flag_vals[i].v));
Py_DECREF(t);
}
PyDict_SetItemString(d,"parser_flags",parser_flags);
/*add in the docstring*/
v = PyString_FromString(moduleDoc);
PyDict_SetItemString(d, "__doc__", v);
Py_DECREF(v);
#if CHAR_SIZE == 16
/* 16 bit copies of the comment delimiters used by handle_bit:
   5 Chars (10 bytes) for "<!--" and 4 Chars (8 bytes) for "-->",
   each including the terminator */
com_head = (Char*)PyMem_Malloc(10);
com_tail = (Char*)PyMem_Malloc(8);
char8_to_Char("<!--",com_head);
char8_to_Char("-->",com_tail);
#endif
}
--Apple-Mail-2-731357005
Content-Disposition: attachment;
filename=benchmarks.py
Content-Transfer-Encoding: quoted-printable
Content-Type: application/octet-stream;
x-unix-mode=0664;
name="benchmarks.py"
#=20benchmark=0A#=20MSXML:=20=20This=20can=20be=20downloaded=20from=20=
many=20places.=20=20You=20need=203.0=0A#=20which=20is=20NOT=20in=20most=20=
newly=20installed=20Windows=20boxes.=20(650kb)=0A#=20=
http://download.microsoft.com/download/xml/Install/3.0/WIN98Me/EN-US/msxml=
3.exe=0A#=20=20=20=20for=20a=20quick=20tutorial=20on=20MSXML=203.0,=20=
see=0A#=20http://www.perfectxml.com/articles/xml/msxml30.asp=0A=0A#=20=
you=20should=20then=20run=20the=20COM=20MakePY=20utility=20on=20the=20=
Pythonwin=20menu.=0A#=20to=20get=20it=20going=20as=20fast=20as=20=
possible.=0A=0A=0Aimport=20sys=0Aimport=20glob=0Aimport=20time=0Aimport=20=
string=0Afrom=20types=20import=20TupleType=0Aimport=20cStringIO=0Aimport=20=
os=0Aimport=20os.path=0A=20=20=20=20=0Adef=20tupleTreeStats(node):=0A=20=20=
=20=20#=20counts=20tags=20and=20attributes=20recursively=0A=20=20=20=20#=20=
use=20for=20all=20reportlab=20parsers=0A=20=20=20=20if=20node[1]=20is=20=
None:=0A=20=20=20=20=20=20=20=20attrCount=20=3D=200=0A=20=20=20=20else:=0A=
=20=20=20=20=20=20=20=20attrCount=20=3D=20len(node[1])=0A=20=20=20=20=
nodeCount=20=3D=201=0A=20=20=20=20if=20node[2]=20is=20not=20None:=0A=20=20=
=20=20=20=20=20=20for=20child=20in=20node[2]:=0A=20=20=20=20=20=20=20=20=20=
=20=20=20if=20type(child)=20is=20TupleType:=0A=20=20=20=20=20=20=20=20=20=
=20=20=20=20=20=20=20a,=20n=20=3D=20tupleTreeStats(child)=0A=20=20=20=20=20=
=20=20=20=20=20=20=20=20=20=20=20attrCount=20=3D=20attrCount=20+=20a=0A=20=
=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20nodeCount=20=3D=20nodeCount=20=
+=20n=0A=20=20=20=20return=20attrCount,=20nodeCount=0A=0A###=20=20pyRXP=20=
-=20our=20wrapper=20around=20Univ=20of=20Edinburgh=0A=0Adef=20=
getPyRXPParser():=0A=20=20=20=20import=20pyRXP=0A=20=20=20=20p=20=3D=20=
pyRXP.Parser()=0A=20=20=20=20return=20p=0A=0Adef=20=
getNonValidatingPyRXPParser():=0A=20=20=20=20import=20pyRXP=0A=20=20=20=20=
p=20=3D=20pyRXP.Parser(Validate=3D0)=0A=20=20=20=20return=20p=0A=0Adef=20=
parseWithPyRXP(parser,=20rawdata):=0A=20=20=20=20return=20=
parser.parse(rawdata)=0A=0A###=20=20uRXP=20-=20Unicode=20version=20of=20=
pyRXP=0A=0Adef=20getuRXPParser():=0A=20=20=20=20import=20uRXP=0A=20=20=20=
=20p=20=3D=20uRXP.Parser()=0A=20=20=20=20return=20p=0A=0Adef=20=
getNonValidatinguRXPParser():=0A=20=20=20=20import=20uRXP=0A=20=20=20=20=
p=20=3D=20uRXP.Parser(Validate=3D0)=0A=20=20=20=20return=20p=0A=0Adef=20=
parseWithuRXP(parser,=20rawdata):=0A=20=20=20=20return=20=
parser.parse(rawdata)=0A=0A=0A###=20=20rparsexml=20-=20Aaron's=20very=20=
fast=20pure=20python=20parser=0A=0Adef=20loadRparseXML():=0A=20=20=20=20=
#it's=20a=20module,=20what=20the=20heck=0A=20=20=20=20from=20=
rlextra.radxml=20import=20rparsexml=0A=20=20=20=20return=20rparsexml=0A=0A=
def=20parseWithRParseXML(rparsexml,=20rawdata):=0A=20=20=20=20#first=20=
argument=20is=20a=20dummy=20holding=20none=0A=20=20=20=20return=20=
rparsexml.parsexml0(rawdata)[0]=20=0A=0A###=20=20expattree=20-=20=
tree-building=20wrapper=20around=20pyexpat=0Adef=20getExpatParser():=0A=20=
=20=20=20import=20expattree=0A=20=20=20=20return=20=
expattree.ExpatTreeParser()=0A=20=20=20=20=0Adef=20=
parseWithExpat(expatParser,=20rawdata):=0A=20=20=20=20#first=20argument=20=
is=20a=20dummy=20holding=20none=0A=20=20=20=20return=20=
expatParser.parse(rawdata)=0A=0A#######=20minidom=20-=20non-validating=20=
DOM=20parser=20in=20the=20Python=20distro=0A=0Adef=20loadMiniDOM():=0A=20=
=20=20=20import=20xml.dom.minidom=0A=20=20=20=20return=20xml.dom.minidom=0A=
=0Adef=20parseWithMiniDOM(dom_module,=20rawdata):=0A=20=20=20=20#parser=20=
is=20None=0A=20=20=20=20return=20dom_module.parseString(rawdata)=0A=20=20=
=20=20=0Adef=20statsWithMiniDOM(node):=0A=20=20=20=20return=20(1,=200)=0A=
=0A#########=20=20Microsoft=20XML=20Parser=20via=20COM=20=
######################=0A=0A=0Adef=20loadMSXML30():=0A=20=20=20=20from=20=
win32com.client=20import=20Dispatch=0A=20=20=20=20msx=20=3D=20=
Dispatch('Microsoft.XMLDOM')=0A=20=20=20=20return=20msx=0A=0Adef=20=
parseWithMSXML30(msx,=20rawdata):=0A=20=20=20=20msx.loadXML(rawdata)=0A=20=
=20=20=20return=20msx=0A=0Adef=20statsWithMSXML30(node):=0A=20=20=20=20=
#not=20done=0A=20=20=20=20return=20(1,0)=20=20=20=20=0A=0A=
###########4DOM=20###############=0Adef=20load4DOM():=0A=20=20=20=20from=20=
xml.dom.ext.reader=20import=20PyExpat=0A=20=20=20=20from=20xml.dom=20=
import=20Node=0A=20=20=20=20reader=20=3D=20PyExpat.Reader()=0A=20=20=20=20=
return=20reader=0A=0Adef=20parseWith4DOM(reader,=20rawdata):=0A=20=20=20=20=
return=20reader.fromString(rawdata)=0A=0A=0Adef=20statsWith4DOM(node):=0A=
=20=20=20=20#node=0A=20=20=20=20return=20(1,0)=0A=0Adef=20=
loadCDomlette():=0A=20=20=20=20from=20Ft.Lib=20import=20cDomlettec=0A=20=20=
=20=20return=20cDomlettec=0A=0Adef=20parseWithCDomlette(modul,=20=
rawdata):=0A=20=20=20=20io=20=3D=20cStringIO.StringIO(rawdata)=0A=20=20=20=
=20return=20modul.parse(io,=20'')=0A=0Adef=20statsWithCDomlette(node):=0A=
=20=20=20=20#node=0A=20=20=20=20return=20(1,0)=0A=0D=0A##########put=20=
them=20all=20together################=0A=0ATESTMAP=20=3D=20[=0A=20=20=20=20=
#=20name=20of=20parser;=20function=20to=20initialize=20if=20needed;=0A=20=
=20=20=20#=20function=20to=20parse;=20function=20to=20do=20stats=0A=20=20=
=20=20('pyRXP',=20getPyRXPParser,=20parseWithPyRXP,=20tupleTreeStats),=0A=
=20=20=20=20('pyRXP_nonvalidating',=20getNonValidatingPyRXPParser,=20=
parseWithPyRXP,=20tupleTreeStats),=0A=20=20=20=20('uRXP',=20=
getuRXPParser,=20parseWithuRXP,=20tupleTreeStats),=0A=20=20=20=20=
('uRXP_nonvalidating',=20getNonValidatinguRXPParser,=20parseWithuRXP,=20=0A=
=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20tupleTreeStats),=0A=20=20=
=20=20('rparsexml',=20loadRparseXML,=20parseWithRParseXML,=20=
tupleTreeStats),=0A=20=20=20=20('expat',=20getExpatParser,=20=
parseWithExpat,=20tupleTreeStats),=0A=20=20=20=20('minidom',=20=
loadMiniDOM,=20parseWithMiniDOM,=20statsWithMiniDOM),=0A=20=20=20=20=
('msxml30',=20loadMSXML30,=20parseWithMSXML30,=20statsWithMSXML30),=0A=20=
=20=20=20('4dom',=20load4DOM,=20parseWith4DOM,=20statsWith4DOM),=0A=20=20=
=20=20('cdomlette',=20loadCDomlette,=20parseWithCDomlette,=20=
statsWithCDomlette)=0A=20=20=20=20]=20=20=20=20=0A=0Adef=20=
interact(testName=3DNone,=20dtd=3D1,=20pause=3D'unknown'):=0A=0A=20=20=20=
=20#=20if=20no=20DTD=20requested,=20trim=20off=20first=202=20lines;=20=
the=20lack=20of=0A=20=20=20=20#=20a=20DTD=20reference=20will=20put=20=
validating=20parsers=20into=20non-=0A=20=20=20=20#=20validating=20mode=0A=
=20=20=20=20if=20dtd:=0A=20=20=20=20=20=20=20=20sampleText=20=3D=20=
open('rml_a.xml').read()=0A=20=20=20=20else:=0A=20=20=20=20=20=20=20=20=
print=20'DTD=20declaration=20removed,=20non-validating'=0A=20=20=20=20=20=
=20=20=20lines=20=3D=20open('rml_a.xml').readlines()[2:]=0A=20=20=20=20=20=
=20=20=20sampleText=20=3D=20string.join(lines,'')=0A=20=20=20=20=20=20=20=
=20=0A=20=20=20=20if=20testName:=0A=20=20=20=20=20=20=20=20found=20=3D=20=
0=0A=20=20=20=20=20=20=20=20for=20row=20in=20TESTMAP:=0A=20=20=20=20=20=20=
=20=20=20=20=20=20if=20row[0]=20=3D=3D=20testName:=0A=20=20=20=20=20=20=20=
=20=20=20=20=20=20=20=20=20found=20=3D=201=0A=20=20=20=20=20=20=20=20=20=20=
=20=20=20=20=20=20(name,=20loadFunc,=20parseFunc,=20statFunc)=20=3D=20=
row=0A=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20break=0A=20=20=20=20=
=20=20=20=20if=20not=20found:=0A=20=20=20=20=20=20=20=20=20=20=20=20=
print=20'parser=20%s=20not=20found,=20please=20select'=20%=20testName=0A=0A=
=20=20=20=20if=20not=20testName:=20=20=20=20=20=20=20=20=20=20=20=20=0A=20=
=20=20=20#=20interactive,=20show=20stuff=0A=20=20=20=20=20=20=20=20print=20=
"Interactive=20benchmark=20suite=20for=20Python=20XML=20tree-parsers."=0A=
=20=20=20=20=20=20=20=20print=20'Using=20sample=20XML=20file=20%d=20=
bytes=20long'=20%=20len(sampleText)=0A=20=20=20=20=20=20=20=20print=20=
"Parsers=20available:"=0A=20=20=20=20=20=20=20=20i=20=3D=201=0A=20=20=20=20=
=20=20=20=20for=20(name,=20a,=20b,=20c)=20in=20TESTMAP:=0A=20=20=20=20=20=
=20=20=20=20=20=20=20print=20'\t%d.=20=20%s'=20%=20(i,=20name)=0A=20=20=20=
=20=20=20=20=20=20=20=20=20i=20=3D=20i=20+=201=0A=20=20=20=20=20=20=20=20=
print=0A=20=20=20=20=20=20=20=20inp=20=3D=20raw_input('Parser=20number=20=
(or=20x=20to=20exit)=20>=20')=0A=20=20=20=20=20=20=20=20if=20inp=20=3D=3D=20=
'x':=0A=20=20=20=20=20=20=20=20=20=20=20=20print=20'bye'=0A=20=20=20=20=20=
=20=20=20=20=20=20=20return=0A=20=20=20=20=20=20=20=20else:=0A=20=20=20=20=
=20=20=20=20=20=20=20=20num=20=3D=20int(inp)=0A=20=20=20=20=20=20=20=20=20=
=20=20=20(name,=20loadFunc,=20parseFunc,=20statFunc)=20=3D=20=
TESTMAP[num-1]=0A=0A=20=20=20=20#=20force=20pause=20to=201=20or=200=20by=20=
asking=0A=20=20=20=20if=20pause=20=3D=3D=20'unknown':=20=0A=20=20=20=20=20=
=20=20=20inp=20=3D=20raw_input("Shall=20we=20do=20memory=20tests?=20=20=
i.e.=20you=20look=20at=20Task=20Manager?=20y/n=20>=20")=0A=20=20=20=20=20=
=20=20=20assert=20inp=20in=20'yn',=20'enter=20"y"=20or=20"n".=20=20=
Please=20run=20again!'=0A=20=20=20=20=20=20=20=20pause=20=3D=20(inp=20=3D=3D=
=20'y')=0A=0A=0A=0A=20=20=20=20print=20'testing=20%s'=20%=20testName=0A=20=
=20=20=20#load=20the=20parser=0A=20=20=20=20t0=20=3D=20time.clock()=0A=20=
=20=20=20parser=20=3D=20loadFunc()=0A=20=20=20=20loadTime=20=3D=20=
time.clock()=20-=20t0=0A=20=20=20=20if=20pause:=0A=20=20=20=20=20=20=20=20=
baseMem=20=3D=20float(raw_input("Pre-parsing:=20please=20input=20python=20=
process=20memory=20in=20kb=20>=20"))=0A=20=20=20=20t1=20=3D=20=
time.clock()=0A=20=20=20=20parsedOutput=20=3D=20parseFunc(parser,=20=
sampleText)=0A=20=20=20=20t2=20=3D=20time.clock()=0A=20=20=20=20=
parseTime=20=3D=20t2=20-=20t1=0A=20=20=20=20=0A=20=20=20=20if=20pause:=0A=
=20=20=20=20=20=20=20=20totalMem=20=3D=20float(raw_input('Post-parsing:=20=
please=20input=20python=20process=20memory=20in=20kb=20>=20'))=0A=20=20=20=
=20=20=20=20=20usedMem=20=3D=20totalMem=20-=20baseMem=0A=20=20=20=20=20=20=
=20=20memFactor=20=3D=20usedMem=20*=201024.0=20/=20len(sampleText)=0A=20=20=
=20=20t3=20=3D=20time.clock()=0A=20=20=20=20n,=20a=20=3D=20=
statFunc(parsedOutput)=0A=20=20=20=20t4=20=3D=20time.clock()=0A=20=20=20=20=
traverseTime=20=3D=20t4=20-=20t3=0A=20=20=20=20print=20'counted=20%d=20=
tags,=20%d=20attributes'=20%=20(n,=20a)=0A=20=20=20=20if=20pause:=0A=20=20=
=20=20=20=20=20=20print=20'%s:=20init=20%0.4f,=20parse=20%0.4f,=20=
traverse=20%0.4f,=20mem=20used=20%dkb,=20mem=20factor=20%0.2f'=20%=20(=0A=
=20=20=20=20=20=20=20=20=20=20=20=20name,=20loadTime,=20parseTime,=20=
traverseTime,=20usedMem,=20memFactor)=0A=20=20=20=20else:=0A=20=20=20=20=20=
=20=20=20print=20'%s:=20init=20%0.4f,=20parse=20%0.4f,=20traverse=20=
%0.4f'=20%=20(=0A=20=20=20=20=20=20=20=20=20=20=20=20name,=20loadTime,=20=
parseTime,=20traverseTime)=0A=20=20=20=20print=0A=0A=20=20=20=20=0Aif=20=
__name__=3D=3D'__main__':=0A=20=20=20=20import=20sys=0A=20=20=20=20args=20=
=3D=20sys.argv[:]=0A=20=20=20=20if=20'-nodtd'=20in=20args:=0A=20=20=20=20=
=20=20=20=20dtd=3D0=0A=20=20=20=20=20=20=20=20args.remove('-nodtd')=0A=20=
=20=20=20else:=0A=20=20=20=20=20=20=20=20dtd=3D1=0A=20=20=20=20=20=20=20=20=
=0A=20=20=20=20if=20'-pause'=20in=20args:=0A=20=20=20=20=20=20=20=20=
pause=20=3D=201=0A=20=20=20=20=20=20=20=20args.remove('-pause')=0A=20=20=20=
=20elif=20'-nopause'=20in=20args:=0A=20=20=20=20=20=20=20=20pause=20=3D=20=
0=0A=20=20=20=20=20=20=20=20args.remove('-nopause')=0A=20=20=20=20else:=0A=
=20=20=20=20=20=20=20=20pause=20=3D=20'unknown'=20=20#=20it=20will=20ask=0A=
=20=20=20=20if=20len(args)=20>=201:=0A=20=20=20=20=20=20=20=20testName=20=
=3D=20args[1]=0A=20=20=20=20else:=0A=20=20=20=20=20=20=20=20testName=20=3D=
=20None=0A=20=20=20=20interact(testName,=20dtd,=20pause=3Dpause)=0A=20=20=
=20=20=0A=20=20=20=20=20=20=20=20=0A=
--Apple-Mail-2-731357005
Content-Disposition: attachment;
filename=xmlparser.h
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
x-unix-mode=0664;
name="xmlparser.h"
/* $Id: xmlparser.h,v 1.2 2002/03/22 11:00:37 rgbecker Exp $ */
#ifndef XMLPARSER_H
#define XMLPARSER_H
#include "dtd.h"
#include "input.h"
#include "rxputil.h"
#include "namespaces.h"
#ifdef FOR_LT
#include "lt-hash.h"
typedef HashTab *HashTable;
#else
#include "hash.h"
#endif
/* Typedefs */
/* Handle types: each is a pointer to the like-named struct declared further
   down in this header (parser_state, attribute, xbit). */
typedef struct parser_state *Parser;
typedef struct attribute *Attribute;
typedef struct xbit *XBit;
/* Function type (not a pointer type) for callbacks that receive a bit plus a
   caller-supplied pointer; struct parser_state stores pointers to this type
   in its dtd_callback and warning_callback members. */
typedef void CallbackProc(XBit bit, void *arg);
/* Function type that yields an InputSource for an Entity; struct parser_state
   stores a pointer to this type in its entity_opener member. */
typedef InputSource EntityOpenerProc(Entity e, void *arg);
/* Bits */
/* Tag stored in the `type` member of struct xbit, identifying what kind of
   bit it is.  XBIT_enum_count is not a bit kind: as the last enumerator it
   equals the number of kinds and sizes the XBitTypeName array below, so new
   kinds must be added before it. */
enum xbit_type {
XBIT_dtd,
XBIT_start, XBIT_empty, XBIT_end, XBIT_eof, XBIT_pcdata,
XBIT_pi, XBIT_comment, XBIT_cdsect,
XBIT_error, XBIT_warning, XBIT_none,
XBIT_enum_count
};
typedef enum xbit_type XBitType;
/* One char8 string per xbit_type value.  NOTE(review): presumably the
   printable name of each kind, indexed by the enum value -- the definition
   is not in this header, confirm in xmlparser.c. */
extern XML_API const char8 *XBitTypeName[XBIT_enum_count];
/* White-space mode recorded per bit (struct xbit.wsm) and per entry of the
   element stack (struct element_info.wsm).
   NOTE(review): presumably reflects the xml:space attribute (compare the
   XMLSpace parser flag) -- confirm against the implementation. */
enum white_space_mode {
WSM_unspecified, WSM_default, WSM_preserve
};
typedef enum white_space_mode WhiteSpaceMode;
/* One prefix-to-namespace binding.  Bindings are chained through `parent`,
   so a NamespaceBinding acts as the head of a singly linked list (struct
   xbit.ns_dict is described as such a list). */
struct namespace_binding {
const Char *prefix; /* points into an attribute name, or is null */
Namespace RXP_NAMESPACE; /* that's namespace or name_space in C++ */
struct namespace_binding *parent; /* next link in the chain of bindings */
};
typedef struct namespace_binding *NamespaceBinding;
/* One attribute, as a node of a singly linked list: struct xbit.attributes
   holds an Attribute and `next` links to the following one. */
struct attribute {
AttributeDefinition definition; /* The definition of this attribute */
NSAttributeDefinition ns_definition; /* NOTE(review): namespace-level definition, judging by the type name -- confirm */
Char *value; /* The (possibly normalised) value */
int quoted; /* Was it quoted? */
int specified; /* Was it not defaulted? */
struct attribute *next; /* The next attribute or null */
};
/* A "bit": the unit handed back by ReadXBit()/PeekXBit(), of the kind given
   by `type`.  The generic payload members s1/S1/S2/i1 are shared across
   kinds; the pcdata_*, pi_*, comment_chars, cdsect_chars and error_message
   macros defined right after this struct give them kind-specific names. */
struct xbit {
Entity entity; /* NOTE(review): presumably the entity the bit was read from -- confirm */
int byte_offset; /* NOTE(review): presumably the offset within that entity -- confirm */
enum xbit_type type; /* Which kind of bit this is */
char8 *s1; /* Aliased as error_message */
Char *S1, *S2; /* S1: pcdata_chars, pi_name, comment_chars, cdsect_chars; S2: pi_chars */
int i1; /* Aliased as pcdata_ignorable_whitespace */
Attribute attributes; /* Head of the linked list of attributes */
ElementDefinition element_definition;
WhiteSpaceMode wsm;
NamespaceBinding ns_dict; /* Linked list of namespace bindings */
int nsc; /* Count of local ns records */
int nsowned; /* True if ns recs should be freed with bit */
NSElementDefinition ns_element_definition;
/* Null if no prefix and no default ns */
#ifndef FOR_LT
/* Tree links; compiled out when FOR_LT is defined.  The same condition
   guards the ReadXTree()/FreeXTree() declarations in this header. */
int nchildren; /* Number of entries in `children` */
struct xbit *parent;
struct xbit **children;
#endif
};
/* Kind-specific names for the generic payload members of struct xbit
   (s1, S1, S2, i1).  These are plain object-like macros, so the identifiers
   on the left are rewritten everywhere they appear after this point in any
   file including this header -- not only in xbit member accesses. */
#define pcdata_chars S1
#define pcdata_ignorable_whitespace i1
#define pi_name S1
#define pi_chars S2
#define comment_chars S1
#define cdsect_chars S1
#define error_message s1
/* Parser flags */
enum parser_flag {
ExpandCharacterEntities,
ExpandGeneralEntities,
XMLSyntax,
XMLPredefinedEntities,
ErrorOnUnquotedAttributeValues,
NormaliseAttributeValues,
ErrorOnBadCharacterEntities,
ErrorOnUndefinedEntities,
ReturnComments,
ReturnProcessingInstructions,
CaseInsensitive,
ErrorOnUndefinedElements,
ErrorOnUndefinedAttributes,
WarnOnRedefinitions,
TrustSDD,
XMLExternalIDs,
ReturnDefaultedAttributes,
MergePCData,
XMLMiscWFErrors,
XMLStrictWFErrors,
AllowMultipleElements,
MaintainElementStack,
IgnoreEntities,
XMLLessThan,
IgnorePlacementErrors,
Validate,
ErrorOnValidityErrors,
XMLSpace,
XMLNamespaces,
NoNoDTDWarning,
SimpleErrorFormat,
AllowUndeclaredNSAttributes,
RelaxedAny,
ReturnNamespaceAttributes,
ProcessDTD
};
typedef enum parser_flag ParserFlag;
#define NormalizeAttributeValues NormaliseAttributeValues
/* Parser */
/* Values held in parser_state.state.
   NOTE(review): the names suggest successive document phases (prolog, DTD
   validation, body, epilog) plus terminal end/error states; the transitions
   are in the implementation, not in this header -- confirm there. */
enum parse_state
{PS_prolog1, PS_prolog2, PS_validate_dtd,
PS_body, PS_validate_final, PS_epilog, PS_end, PS_error};
/* Element type of parser_state.element_stack.
   NOTE(review): presumably one entry per currently open element (compare the
   MaintainElementStack flag) -- confirm in the implementation. */
struct element_info {
ElementDefinition definition;
NSElementDefinition ns_definition;
Entity entity;
FSMNode context; /* NOTE(review): presumably content-model state used for validation -- confirm */
WhiteSpaceMode wsm; /* Same type as struct xbit.wsm */
NamespaceBinding ns; /* Same type as struct xbit.ns_dict */
int nsc; /* Same name and type as struct xbit.nsc (count of local ns records there) */
};
/* The state behind a Parser handle (Parser is a pointer to this struct).
   The layout is public in this header because ParserGetFlag() is a macro
   that reads `flags` directly. */
struct parser_state {
enum parse_state state;
int seen_validity_error;
Entity document_entity;
int have_dtd; /* True if dtd has been processed */
StandaloneDeclaration standalone;
struct input_source *source;
Char *name, *pbuf, *save_pbuf;
char8 *transbuf;
char8 errbuf[400]; /* For error messages; fixed size is bad but
we don't want to fail if we can't malloc */
char8 escbuf[2][15];
int namelen, pbufsize, pbufnext, save_pbufsize, save_pbufnext;
struct xbit xbit; /* Embedded by value, not a pointer */
int peeked; /* NOTE(review): presumably set when PeekXBit() has read ahead -- confirm */
Dtd dtd; /* The document's DTD */
CallbackProc *dtd_callback;
CallbackProc *warning_callback;
EntityOpenerProc *entity_opener;
unsigned int flags[2]; /* We now have >32 flags */
Vector(struct element_info, element_stack); /* Vector is a macro declared elsewhere (not in this header) */
struct namespace_binding base_ns;
void *callback_arg; /* NOTE(review): presumably passed as `arg` to the callbacks above -- confirm */
int external_pe_depth; /* To keep track of whether we're in the */
/* internal subset: 0 <=> yes */
HashTable id_table;
};
/* Public entry points.  Only the declarations are visible here; behavioural
   notes below are hedged and should be confirmed against xmlparser.c. */

/* Library-wide setup / teardown (take no parser argument). */
XML_API int init_parser(void);
XML_API void deinit_parser(void);
/* Parser lifetime: NewParser returns a Parser handle, FreeParser takes one. */
XML_API Parser NewParser(void);
XML_API void FreeParser(Parser p);
/* Accessors returning the Entity / InputSource associated with the parser's root. */
XML_API Entity ParserRootEntity(Parser p);
XML_API InputSource ParserRootSource(Parser p);
/* Bit-at-a-time interface.  NOTE(review): presumably Read consumes the next
   bit while Peek leaves it to be returned again -- confirm. */
XML_API XBit ReadXBit(Parser p);
XML_API XBit PeekXBit(Parser p);
XML_API void FreeXBit(XBit xbit);
#ifndef FOR_LT
/* Tree interface, available only when FOR_LT is not defined (matching the
   nchildren/parent/children members of struct xbit).
   NOTE(review): unlike their neighbours these two lack XML_API -- confirm
   whether that is intentional. */
XBit ReadXTree(Parser p);
void FreeXTree(XBit tree);
#endif
XML_API XBit ParseDtd(Parser p, Entity e);
/* Setters for the callback-related members of struct parser_state
   (warning_callback, dtd_callback, entity_opener, callback_arg). */
XML_API void ParserSetWarningCallback(Parser p, CallbackProc cb);
XML_API void ParserSetDtdCallback(Parser p, CallbackProc cb);
XML_API void ParserSetEntityOpener(Parser p, EntityOpenerProc opener);
XML_API void ParserSetCallbackArg(Parser p, void *arg);
/* Input source stack.  NOTE(review): presumably Push makes `source` the
   current input and Pop returns to the previous one -- confirm. */
XML_API int ParserPush(Parser p, InputSource source);
XML_API void ParserPop(Parser p);
/* Write counterpart of the ParserGetFlag() macro declared below. */
XML_API void ParserSetFlag(Parser p, ParserFlag flag, int value);
/* Test one parser flag.
   `flag` is a ParserFlag bit index: indices 0-31 address flags[0], indices
   32-63 address flags[1].  The macro yields the masked bit itself (nonzero
   when the flag is set, 0 when clear), not a normalised 0/1.
   `flag` is expanded twice, so it must not have side effects. */
#define ParserGetFlag(p, flag) \
((p)->flags[(flag) >> 5] & (1u << ((flag) & 31)))
/* Error reporting for a bit.  The underscore variant additionally takes the
   FILE16 to use.  NOTE(review): presumably ParserPerror() reports to a
   default stream -- confirm in the implementation.
   NOTE(review): identifiers starting with an underscore followed by an
   uppercase letter are reserved for the implementation by the C standard;
   the name _ParserPerror falls in that set. */
XML_API void _ParserPerror(FILE16 *f, Parser p, XBit bit);
XML_API void ParserPerror(Parser p, XBit bit);
#endif /* XMLPARSER_H */
--Apple-Mail-2-731357005
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
format=flowed
--
Stuart Bishop <zen@shangri-la.dropbear.id.au>
http://shangri-la.dropbear.id.au/
--Apple-Mail-2-731357005--