public class CophirXmlParser
extends org.xml.sax.helpers.DefaultHandler
Modifier and Type | Field and Description |
---|---|
protected static java.lang.String |
descriptorTagName
Name of the tag that encapsulates all MPEG7 visual descriptors
|
protected static java.lang.String |
descriptorTypeAttributeName
Name of attribute of the
descriptorTagName tag that contains the descriptor name |
protected static java.lang.String |
TEXT_SPLIT_REGEXP
Regular expression used to split the tag text into words
|
protected static java.lang.String |
wordObjectName
Name of the key words object
|
protected static java.util.List<java.lang.String> |
wordTagNames
Names of tags that contain words for keyword data
|
Constructor and Description |
---|
CophirXmlParser()
Creates a new handler for parsing CoPhIR XML files.
|
CophirXmlParser(Stemmer stemmer,
IntStorageIndexed<java.lang.String> wordIndex)
Creates a new handler for parsing CoPhIR XML files.
|
CophirXmlParser(Stemmer stemmer,
IntStorageIndexed<java.lang.String> wordIndex,
java.lang.String csvObjectName)
Creates a new handler for parsing CoPhIR XML files.
|
Modifier and Type | Method and Description |
---|---|
protected static void |
addDirectoryTriples(java.lang.StringBuilder fileName)
Prepends the fileName with its first three chars and second three chars as directories.
|
static CophirXmlParser |
create(java.io.File file,
Stemmer stemmer,
IntStorageIndexed<java.lang.String> wordIndex)
Factory method that parses the given CoPhIR XML file.
|
static CophirXmlParser |
create(java.io.File xmlDir,
java.lang.String identifier,
Stemmer stemmer,
IntStorageIndexed<java.lang.String> wordIndex)
Factory method that parses a CoPhIR XML file with the given identifier.
|
void |
endDocument() |
void |
endElement(java.lang.String uri,
java.lang.String localName,
java.lang.String qName) |
java.lang.String |
getLocatorURI()
Returns the parsed locator URI.
|
int |
getObjectCount()
Returns the number of the parsed descriptor objects.
|
java.util.Map<java.lang.String,LocalAbstractObject> |
getObjects()
Returns the parsed descriptor objects.
|
java.lang.String[] |
getTextFields()
Returns the textual data fields parsed from the
wordTagNames XML tags. |
void |
characters(char[] ch,
int start,
int length) |
static java.lang.String |
idToPath(java.lang.String id,
java.lang.String extension)
Returns a file path derived from the object identifier.
|
static ObjectIntMultiVectorJaccard |
parseKeyWordsType(java.lang.String[] texts,
Stemmer stemmer,
IntStorageIndexed<java.lang.String> wordIndex)
Parse the keywords descriptor data.
|
static java.lang.String |
pathToId(java.io.File file)
Returns an object identifier from a file.
|
static java.util.Iterator<java.lang.String> |
pathToId(java.util.Iterator<java.io.File> iterator)
Returns an iterator of object identifiers from an iterator of files.
|
void |
resetObjects()
Resets the parsed data so that this handler can be reused for additional parsing.
|
void |
startElement(java.lang.String uri,
java.lang.String localName,
java.lang.String qName,
org.xml.sax.Attributes attributes) |
protected static final java.lang.String descriptorTagName
protected static final java.lang.String descriptorTypeAttributeName
Name of attribute of the descriptorTagName tag that contains the descriptor name
protected static final java.util.List<java.lang.String> wordTagNames
protected static final java.lang.String wordObjectName
protected static final java.lang.String TEXT_SPLIT_REGEXP
public CophirXmlParser()
public CophirXmlParser(Stemmer stemmer, IntStorageIndexed<java.lang.String> wordIndex)
stemmer
- a Stemmer
for word transformationwordIndex
- the index for translating words to addresses
(if null the key words descriptor is not created)public CophirXmlParser(Stemmer stemmer, IntStorageIndexed<java.lang.String> wordIndex, java.lang.String csvObjectName)
stemmer
- a Stemmer
for word transformationwordIndex
- the index for translating words to addresses
(if null the key words descriptor is not created)csvObjectName
- the name of the CSV key words objectpublic static CophirXmlParser create(java.io.File file, Stemmer stemmer, IntStorageIndexed<java.lang.String> wordIndex) throws javax.xml.parsers.ParserConfigurationException, org.xml.sax.SAXException, java.io.IOException
file
- the CoPhIR XML file to read the object fromstemmer
- a Stemmer
for word transformationwordIndex
- the index for translating words to addresses
(if null the key words descriptor is not created)
javax.xml.parsers.ParserConfigurationException
- if an XML parser cannot be created
org.xml.sax.SAXException
- if there was an error parsing the XML file
java.io.IOException
- if there was an error reading the XML file
public static CophirXmlParser create(java.io.File xmlDir, java.lang.String identifier, Stemmer stemmer, IntStorageIndexed<java.lang.String> wordIndex) throws javax.xml.parsers.ParserConfigurationException, org.xml.sax.SAXException, java.io.IOException
xmlDir
- the root directory where CoPhIR XML files are stored
identifier
- the CoPhIR object identifier to readstemmer
- a Stemmer
for word transformationwordIndex
- the index for translating words to addresses
(if null the key words descriptor is not created)
javax.xml.parsers.ParserConfigurationException
- if an XML parser cannot be created
org.xml.sax.SAXException
- if there was an error parsing the XML file
java.io.IOException
- if there was an error reading the XML file
public java.util.Map<java.lang.String,LocalAbstractObject> getObjects()
public int getObjectCount()
public java.lang.String getLocatorURI()
public java.lang.String[] getTextFields()
Returns the textual data fields parsed from the wordTagNames XML tags.
public void resetObjects()
public void startElement(java.lang.String uri, java.lang.String localName, java.lang.String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException
startElement
in interface org.xml.sax.ContentHandler
startElement
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws org.xml.sax.SAXException
endElement
in interface org.xml.sax.ContentHandler
endElement
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void endDocument() throws org.xml.sax.SAXException
endDocument
in interface org.xml.sax.ContentHandler
endDocument
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException
characters
in interface org.xml.sax.ContentHandler
characters
in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public static ObjectIntMultiVectorJaccard parseKeyWordsType(java.lang.String[] texts, Stemmer stemmer, IntStorageIndexed<java.lang.String> wordIndex) throws TextConversionException
texts
- the texts to parsestemmer
- a Stemmer
for word transformationwordIndex
- the index for translating words to addressesObjectIntMultiVectorJaccard
TextConversionException
- if there was an error stemming the wordprotected static void addDirectoryTriples(java.lang.StringBuilder fileName)
fileName
- fileName to modifypublic static java.lang.String idToPath(java.lang.String id, java.lang.String extension)
id
- the object identifierextension
- the file extension to add (no extension is added if null)public static java.lang.String pathToId(java.io.File file)
file
- the file for which to get the identifierpublic static java.util.Iterator<java.lang.String> pathToId(java.util.Iterator<java.io.File> iterator)
iterator
- the iterator of files for which to get the identifier