Commit 29c66419 authored by Michal Batko's avatar Michal Batko
Browse files

* Removed special exception handling for XmlFile processing

* Added Lucene support (note that lucene-core library is needed)
parent 81413841
Loading
Loading
Loading
Loading
+84 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene;

import java.io.File;
import java.io.IOException;
import messif.algorithms.Algorithm;
import messif.objects.NoDataObject;
import messif.objects.impl.ObjectString;
import messif.operations.query.KNNQueryOperation;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Encapsulation of a Lucene index into MESSIF algorithm.
 * Note that this algorithm works only with {@link ObjectString} objects
 * and the distance function is ignored. Note also, that this algorithm
 * is not dynamic, thus no data-manipulation operations are supported.
 * 
 * @author xbatko
 */
public class LuceneAlgorithm extends Algorithm {
    /** class serial id for serialization */
    private static final long serialVersionUID = 1L;

    /** Searcher over the underlying Lucene index */
    private final Searcher searcher;
    /** Parser that converts query strings into Lucene queries */
    private final QueryParser queryParser;
    /** Name of the indexed-document field that holds the object locator */
    private final String locatorField;

    /**
     * Creates a new Lucene algorithm on a given index directory.
     * The index file must be created first (e.g. by using CoPhIRSAXAllInOneIndexer).
     * 
     * @param indexDir the Lucene index directory
     * @param locatorField the name of the field in the indexed documents where the locator is stored
     * @param searchFields the names of the fields in the indexed documents that are used for searching
     * @throws CorruptIndexException if the index file is corrupted
     * @throws IOException if there was an error reading from the index file
     */
    @AlgorithmConstructor(description="Lucene index", arguments={"index directory", "locator field name", "search field names array"})
    public LuceneAlgorithm(File indexDir, String locatorField, String... searchFields) throws CorruptIndexException, IOException {
        super("Lucene index in " + indexDir);

        // Open the index read-only - this algorithm only searches, never modifies
        this.searcher = new IndexSearcher(IndexReader.open(FSDirectory.open(indexDir), true));
        this.locatorField = locatorField;

        // A single search field uses the plain parser, multiple fields need the multi-field one
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        this.queryParser = (searchFields.length == 1)
                ? new QueryParser(Version.LUCENE_CURRENT, searchFields[0], analyzer)
                : new MultiFieldQueryParser(Version.LUCENE_CURRENT, searchFields, analyzer);
    }

    /**
     * Search the index and returns the results.
     * The text of the {@link ObjectString} query object is parsed as a Lucene
     * query and the top-k matches are added to the operation's answer
     * (the Lucene score is used as the distance).
     * @param operation the kNN operation with {@link ObjectString} query object
     * @throws ParseException if there was an error parsing the search text
     * @throws IOException if there was an error reading data from the index
     */
    public void knnSearch(KNNQueryOperation operation) throws ParseException, IOException {
        String queryText = ((ObjectString)operation.getQueryObject()).getStringData();
        TopDocs topDocs = searcher.search(queryParser.parse(queryText), operation.getK());
        for (ScoreDoc match : topDocs.scoreDocs) {
            // Only the locator is returned, so a NoDataObject is sufficient for the answer
            Document doc = searcher.doc(match.doc);
            operation.addToAnswer(new NoDataObject(doc.get(locatorField)), match.score, null);
        }
        operation.endOperation();
    }
}
+88 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import messif.utility.XmlFileProcessor;
import messif.utility.XmlFileSearch;
import messif.utility.reflection.InstantiatorSignature;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Processor for creating Lucene indexes.
 * @author xbatko
 */
public class LuceneBuilder implements XmlFileProcessor, Closeable {
    /** Index writer */
    private final IndexWriter writer;
    /** XML document parser */
    private final SAXParser parser;
    /** XML document parsing handler */
    private final LuceneXmlFileHandler handler;

    /**
     * Creates a Lucene index builder.
     * @param indexDir the directory where the index will be created (add to, if it already exists)
     * @param handler the XML document parsing handler
     * @param analyzer the Lucene analyzer for text normalization
     * @throws Exception if there was a problem initializing the builder
     */
    protected LuceneBuilder(File indexDir, LuceneXmlFileHandler handler, Analyzer analyzer) throws Exception {
        this.writer = new IndexWriter(
                FSDirectory.open(indexDir),
                analyzer == null ? new StandardAnalyzer(Version.LUCENE_CURRENT) : analyzer,
                true, // Create if not exists
                IndexWriter.MaxFieldLength.UNLIMITED
        );
        this.parser = SAXParserFactory.newInstance().newSAXParser();
        this.handler = handler;
    }

    public void xmlFileProcess(String source, InputStream xmlFileData) {
        try {
            parser.parse(xmlFileData, handler);
            writer.addDocument(handler.getLuceneDocument());
            handler.releaseLuceneDocument();
        } catch (Exception e) {
            System.err.print("Error processing '");
            System.err.print(source);
            System.err.print("': ");
            System.err.println(e);
        }        
    }

    public void close() throws IOException {
        writer.optimize();
        writer.close();        
    }

    /**
     * Builds the Lucene index.
     * @param args index directory, handler instance,
     * @throws Exception if there was an error 
     */
    @SuppressWarnings("UseOfSystemOutOrSystemErr")
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println("Usage: " + LuceneBuilder.class.getName() + " <directory> <handler instance> <dir|TAR file> ...");
            System.exit(1);
        }
        LuceneBuilder processor = new LuceneBuilder(
                new File(args[0]),
                InstantiatorSignature.createInstanceWithStringArgs(args[1], LuceneXmlFileHandler.class, null),
                null
        );
        try {
            XmlFileSearch.process(processor, 2, args);
        } finally {
            processor.close();
        }
    }
}
+96 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Abstract ancestor of all handlers that can parse a Lucene {@link Document}.
 * @author xbatko
 */
public abstract class LuceneXmlFileHandler extends DefaultHandler {
    /** Lucene document being built for the currently parsed XML file */
    private Document document;
    /** Name of the field currently being collected (<tt>null</tt> when no field is open) */
    private String openFieldName;
    /** Storage flag of the currently collected field */
    private Field.Store openFieldStore;
    /** Indexing flag of the currently collected field */
    private Field.Index openFieldIndex;
    /** Buffer accumulating character data of the currently collected field */
    private StringBuilder openFieldData;

    /**
     * Starts a document field with character data within the tag.
     * The character data reported by the parser are collected until
     * {@link #endField()} is called.
     * @param name the name of the document field
     * @param store flag whether to store the field data in the index
     * @param index flag whether to index the field data
     */
    protected final void startField(String name, Field.Store store, Field.Index index) {
        openFieldName = name;
        openFieldStore = store;
        openFieldIndex = index;
        openFieldData = new StringBuilder();
    }

    /**
     * Starts a document field with the given value.
     * Note that the character data within the tag are ignored and the field
     * is added only if the value is not <tt>null</tt>.
     * @param name the name of the document field
     * @param value the value for the field
     * @param store flag whether to store the field data in the index
     * @param index flag whether to index the field data
     */
    protected final void startField(String name, String value, Field.Store store, Field.Index index) {
        if (name == null || value == null)
            return;
        document.add(new Field(name, value, store, index));
    }

    /**
     * Finishes the current field and adds it to the document.
     * Does nothing when no field is currently open.
     */
    protected final void endField() {
        if (openFieldData == null)
            return;
        document.add(new Field(openFieldName, openFieldData.toString(), openFieldStore, openFieldIndex));
        openFieldName = null;
        openFieldStore = null;
        openFieldIndex = null;
        openFieldData = null;
    }

    @Override
    public void startDocument() throws SAXException {
        // A fresh Lucene document is created for every parsed XML file
        document = new Document();
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        // Character data are collected only while a field is open
        if (openFieldData == null)
            return;
        openFieldData.append(ch, start, length);
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        endField();
    }

    /**
     * Returns the current Lucene document.
     * A new document is created whenever this handler is used
     * to parse a new document, i.e. when {@link #startDocument()} is called.
     * @return the current Lucene document
     */
    public Document getLuceneDocument() {
        return document;
    }

    /**
     * Releases the allocated Lucene document.
     */
    public void releaseLuceneDocument() {
        document = null;
    }
}
+33 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene.handlers;

import messif.objects.text.lucene.LuceneXmlFileHandler;
import org.apache.lucene.document.Field;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Implementation of the {@link LuceneXmlFileHandler} that extracts the content
 * of certain CoPhIR text attributes and elements.
 * The content is then stored in a Lucene document in a single field "allinone".
 */
public class CoPhIRAllInOneHandler extends LuceneXmlFileHandler {
    @Override
    public void startElement(String nsURI, String strippedName, String tagName, Attributes attributes) throws SAXException {
        // The media URI is stored verbatim for retrieval but is not searchable
        if (tagName.equalsIgnoreCase("mediauri")) {
            startField("uri", Field.Store.YES, Field.Index.NO);
            return;
        }
        // Title, description and tag contents are all merged into the single "allinone" field
        if (tagName.equals("title") || tagName.equals("description") || tagName.equals("tag")) {
            startField("allinone", Field.Store.YES, Field.Index.ANALYZED);
        } else if (tagName.equals("comment")) {
            // Comments are searchable but their text is not stored
            startField("allinone", Field.Store.NO, Field.Index.ANALYZED);
        } else if (tagName.equals("owner")) {
            startField("allinone", attributes.getValue("username"), Field.Store.YES, Field.Index.ANALYZED);
            startField("allinone", attributes.getValue("realname"), Field.Store.YES, Field.Index.ANALYZED);
        } else if (tagName.equals("dates")) {
            startField("taken", attributes.getValue("taken"), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        }
    }
}
+33 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene.handlers;

import messif.objects.text.lucene.LuceneXmlFileHandler;
import org.apache.lucene.document.Field;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Implementation of the {@link LuceneXmlFileHandler} that extracts the content
 * of certain CoPhIR text attributes and elements.
 * The content is then stored in a Lucene document in corresponding fields.
 */
public class CoPhIRFieldsHandler extends LuceneXmlFileHandler {
    @Override
    public void startElement(String nsURI, String strippedName, String tagName, Attributes attributes) throws SAXException {
        // Each recognized CoPhIR element is stored in its own Lucene field
        if (tagName.equalsIgnoreCase("mediauri")) {
            // The media URI is stored verbatim but is not searchable
            startField("uri", Field.Store.YES, Field.Index.NO);
            return;
        }
        if (tagName.equals("title")) {
            startField("title", Field.Store.YES, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("description")) {
            startField("description", Field.Store.YES, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("tag")) {
            startField("tag", Field.Store.YES, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("comment")) {
            // Comments are searchable but their text is not stored
            startField("comment", Field.Store.NO, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("owner")) {
            // Owner names come from attributes, not from element character data
            startField("username", attributes.getValue("username"), Field.Store.YES, Field.Index.ANALYZED);
            startField("realname", attributes.getValue("realname"), Field.Store.YES, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("dates")) {
            startField("taken", attributes.getValue("taken"), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        }
    }
}
Loading