// messif/objects/text/lucene/LuceneAlgorithm.java
package messif.objects.text.lucene;

import java.io.File;
import java.io.IOException;
import messif.algorithms.Algorithm;
import messif.objects.NoDataObject;
import messif.objects.impl.ObjectString;
import messif.operations.query.KNNQueryOperation;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Encapsulation of a Lucene index into MESSIF algorithm.
 * Note that this algorithm works only with {@link ObjectString} objects
 * and the distance function is ignored. Note also, that this algorithm
 * is not dynamic, thus no data-manipulation operations are supported.
 *
 * @author xbatko
 */
public class LuceneAlgorithm extends Algorithm {
    /** class serial id for serialization */
    private static final long serialVersionUID = 1L;

    /** Lucene index searcher */
    private final Searcher searcher;
    /** Lucene query parser */
    private final QueryParser queryParser;
    /** Name of the field that stores the document locator */
    private final String locatorField;

    /**
     * Creates a new Lucene algorithm on a given index directory.
     * The index must be created first (e.g. by using {@link LuceneBuilder}).
     *
     * @param indexDir the Lucene index directory
     * @param locatorField the name of the field in the indexed documents where the locator is stored
     * @param searchFields the names of the fields in the indexed documents that are used for searching;
     *          at least one name must be given
     * @throws IllegalArgumentException if no search field name was specified
     * @throws CorruptIndexException if the index file is corrupted
     * @throws IOException if there was an error reading from the index file
     */
    @AlgorithmConstructor(description="Lucene index", arguments={"index directory", "locator field name", "search field names array"})
    public LuceneAlgorithm(File indexDir, String locatorField, String... searchFields) throws CorruptIndexException, IOException {
        super("Lucene index in " + indexDir);

        // Validate the arguments before the index is opened, so that a bad call does not leave an open reader behind
        if (searchFields == null || searchFields.length == 0)
            throw new IllegalArgumentException("At least one search field name must be specified");

        // Open the index
        this.searcher = new IndexSearcher(IndexReader.open(FSDirectory.open(indexDir), true)); // just searching, so read-only=true

        // Set the locator field
        this.locatorField = locatorField;

        // Initialize query parser (a plain parser is sufficient for a single default field)
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        if (searchFields.length == 1)
            this.queryParser = new QueryParser(Version.LUCENE_CURRENT, searchFields[0], analyzer);
        else
            this.queryParser = new MultiFieldQueryParser(Version.LUCENE_CURRENT, searchFields, analyzer);
    }

    /**
     * Searches the index and adds the results to the operation answer.
     * The string data of the query object are parsed as the Lucene query text and at most
     * {@code operation.getK()} best hits are retrieved. Every hit is added to the answer
     * as a {@link NoDataObject} created from the value of the locator field, with the
     * Lucene score of the hit passed as the second argument of {@code addToAnswer}.
     * NOTE(review): Lucene scores grow with relevance - confirm that the consumers
     * of the answer expect this ordering.
     *
     * @param operation the kNN operation with {@link ObjectString} query object
     * @throws ParseException if there was an error parsing the search text
     * @throws IOException if there was an error reading data from the index
     */
    public void knnSearch(KNNQueryOperation operation) throws ParseException, IOException {
        String queryText = ((ObjectString)operation.getQueryObject()).getStringData();
        TopDocs topDocs = searcher.search(queryParser.parse(queryText), operation.getK());
        for (ScoreDoc match : topDocs.scoreDocs) {
            // Load the stored document of the hit to get its locator
            Document doc = searcher.doc(match.doc);
            operation.addToAnswer(new NoDataObject(doc.get(locatorField)), match.score, null);
        }
        operation.endOperation();
    }
}

// messif/objects/text/lucene/LuceneBuilder.java
package messif.objects.text.lucene;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import messif.utility.XmlFileProcessor;
import messif.utility.XmlFileSearch;
import messif.utility.reflection.InstantiatorSignature;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Processor for creating Lucene indexes.
 * Every processed XML file is parsed by the given {@link LuceneXmlFileHandler}
 * and the resulting Lucene document is added to the index.
 * @author xbatko
 */
public class LuceneBuilder implements XmlFileProcessor, Closeable {
    /** Index writer */
    private final IndexWriter writer;
    /** XML document parser */
    private final SAXParser parser;
    /** XML document parsing handler */
    private final LuceneXmlFileHandler handler;

    /**
     * Creates a Lucene index builder.
     * @param indexDir the directory where the index will be created (add to, if it already exists)
     * @param handler the XML document parsing handler
     * @param analyzer the Lucene analyzer for text normalization;
     *          if <tt>null</tt>, a {@link StandardAnalyzer} is used
     * @throws IllegalArgumentException if the handler is <tt>null</tt>
     * @throws Exception if there was a problem initializing the builder
     */
    protected LuceneBuilder(File indexDir, LuceneXmlFileHandler handler, Analyzer analyzer) throws Exception {
        if (handler == null)
            throw new IllegalArgumentException("XML document parsing handler must not be null");
        this.handler = handler;

        // Create the parser first, so that a parser failure cannot leave an opened index writer behind
        this.parser = SAXParserFactory.newInstance().newSAXParser();

        // The constructor without the "create" flag creates the index if it does not exist
        // and appends to it otherwise (passing create=true would overwrite an existing index)
        this.writer = new IndexWriter(
                FSDirectory.open(indexDir),
                analyzer == null ? new StandardAnalyzer(Version.LUCENE_CURRENT) : analyzer,
                IndexWriter.MaxFieldLength.UNLIMITED
        );
    }

    /**
     * Parses the given XML data and adds the resulting Lucene document to the index.
     * Any error is reported on the standard error output and the processing
     * continues, i.e. no exception is propagated to the caller.
     * @param source the name of the source of the XML data (used in error messages only)
     * @param xmlFileData the stream with the XML data to parse
     */
    public void xmlFileProcess(String source, InputStream xmlFileData) {
        try {
            parser.parse(xmlFileData, handler);
            writer.addDocument(handler.getLuceneDocument());
        } catch (Exception e) {
            System.err.print("Error processing '");
            System.err.print(source);
            System.err.print("': ");
            System.err.println(e);
        } finally {
            // Drop the (possibly partial) document even if the parsing or indexing failed
            handler.releaseLuceneDocument();
        }
    }

    /**
     * Optimizes the index and closes the index writer.
     * The writer is closed even if the optimization fails.
     * @throws IOException if there was an error optimizing or closing the index
     */
    public void close() throws IOException {
        try {
            writer.optimize();
        } finally {
            writer.close();
        }
    }

    /**
     * Builds the Lucene index.
     * @param args the index directory, the handler instance signature,
     *          and one or more directories or TAR files with the XML data to index
     * @throws Exception if there was an error
     */
    @SuppressWarnings("UseOfSystemOutOrSystemErr")
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println("Usage: " + LuceneBuilder.class.getName() + " <directory> <handler instance> <dir|TAR file> ...");
            System.exit(1);
        }

        LuceneBuilder processor = new LuceneBuilder(
                new File(args[0]),
                InstantiatorSignature.createInstanceWithStringArgs(args[1], LuceneXmlFileHandler.class, null),
                null
        );
        try {
            // The first two arguments are consumed above, the data sources start at index 2
            XmlFileSearch.process(processor, 2, args);
        } finally {
            processor.close();
        }
    }
}

// messif/objects/text/lucene/LuceneXmlFileHandler.java
package messif.objects.text.lucene;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Abstract ancestor of all handlers that can parse a Lucene {@link Document}.
 * Subclasses call one of the {@code startField} methods when an interesting
 * XML element starts; the started field is finished when any element ends.
 * @author xbatko
 */
public abstract class LuceneXmlFileHandler extends DefaultHandler {
    /** Current Lucene document */
    private Document document;
    /** Name of the document field currently filled from the XML file */
    private String fieldName;
    /** Flag whether to store the currently filled field data in the index */
    private Field.Store fieldStore;
    /** Flag whether to index the currently filled field */
    private Field.Index fieldIndex;
    /** Data for the currently filled field (<tt>null</tt> if no field is being filled) */
    private StringBuilder fieldValue;

    /**
     * Starts a document field with character data within the tag.
     * A field that was started before but not finished yet is discarded.
     * @param name the name of the document field
     * @param store flag whether to store the field data in the index
     * @param index flag whether to index the field data
     */
    protected final void startField(String name, Field.Store store, Field.Index index) {
        fieldName = name;
        fieldStore = store;
        fieldIndex = index;
        fieldValue = new StringBuilder();
    }

    /**
     * Starts a document field with the given value.
     * Note that the character data within the tag are ignored and the field
     * is added only if neither the name nor the value is <tt>null</tt>.
     * @param name the name of the document field
     * @param value the value for the field
     * @param store flag whether to store the field data in the index
     * @param index flag whether to index the field data
     */
    protected final void startField(String name, String value, Field.Store store, Field.Index index) {
        if (name != null && value != null)
            document.add(new Field(name, value, store, index));
    }

    /**
     * Finishes the current field and adds it to the document.
     * Nothing happens if no field is currently being filled. The field state
     * is cleared even if the field cannot be created or added.
     */
    protected final void endField() {
        if (fieldValue == null)
            return;
        try {
            document.add(new Field(fieldName, fieldValue.toString(), fieldStore, fieldIndex));
        } finally {
            clearField();
        }
    }

    /**
     * Forgets the field that is currently being filled (if any).
     */
    private void clearField() {
        fieldName = null;
        fieldStore = null;
        fieldIndex = null;
        fieldValue = null;
    }

    @Override
    public void startDocument() throws SAXException {
        // A previous parsing may have been aborted in the middle of a field,
        // so drop the stale field data before a new document is started
        clearField();
        document = new Document();
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        // Character data are collected only while a field is being filled
        if (fieldValue != null)
            fieldValue.append(ch, start, length);
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        // Any closing tag finishes the currently filled field
        endField();
    }

    /**
     * Returns the current Lucene document.
     * A new document is created whenever this handler is used
     * to parse a new document, i.e. when {@link #startDocument()} is called.
     * @return the current Lucene document
     */
    public Document getLuceneDocument() {
        return document;
    }

    /**
     * Releases the allocated Lucene document.
     */
    public void releaseLuceneDocument() {
        document = null;
    }
}

// messif/objects/text/lucene/handlers/CoPhIRAllInOneHandler.java
package messif.objects.text.lucene.handlers;

import messif.objects.text.lucene.LuceneXmlFileHandler;
import org.apache.lucene.document.Field;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Implementation of the {@link LuceneXmlFileHandler} that extracts the content
 * of certain CoPhIR text attributes and elements.
 * The content is then stored in a Lucene document in a single field "allinone".
 */
public class CoPhIRAllInOneHandler extends LuceneXmlFileHandler {
    /** Name of the field that gathers all the textual data */
    private static final String ALL_IN_ONE_FIELD = "allinone";
    /** Names of the attributes of the "owner" tag that are added to the gathering field */
    private static final String[] OWNER_ATTRIBUTES = { "username", "realname" };

    @Override
    public void startElement(String nsURI, String strippedName, String tagName, Attributes attributes) throws SAXException {
        // The media URI is only stored, it is not searchable
        if (tagName.equalsIgnoreCase("mediauri")) {
            startField("uri", Field.Store.YES, Field.Index.NO);
            return;
        }

        // Stored and analyzed textual elements
        if (tagName.equals("title") || tagName.equals("description") || tagName.equals("tag")) {
            startField(ALL_IN_ONE_FIELD, Field.Store.YES, Field.Index.ANALYZED);
            return;
        }

        // Comments are analyzed but their text is not stored
        if (tagName.equals("comment")) {
            startField(ALL_IN_ONE_FIELD, Field.Store.NO, Field.Index.ANALYZED);
            return;
        }

        // Owner names are taken from the tag attributes
        if (tagName.equals("owner")) {
            for (String attributeName : OWNER_ATTRIBUTES)
                startField(ALL_IN_ONE_FIELD, attributes.getValue(attributeName), Field.Store.YES, Field.Index.ANALYZED);
            return;
        }

        // The date of taking is kept in a separate non-analyzed field
        if (tagName.equals("dates"))
            startField("taken", attributes.getValue("taken"), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
    }
}

// messif/objects/text/lucene/handlers/CoPhIRFieldsHandler.java
package messif.objects.text.lucene.handlers;

import messif.objects.text.lucene.LuceneXmlFileHandler;
import org.apache.lucene.document.Field;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Implementation of the {@link LuceneXmlFileHandler} that extracts the content
 * of certain CoPhIR text attributes and elements.
 * The content is then stored in a Lucene document in corresponding fields.
 */
public class CoPhIRFieldsHandler extends LuceneXmlFileHandler {
    /** Names of the attributes of the "owner" tag that are stored in fields of the same name */
    private static final String[] OWNER_ATTRIBUTES = { "username", "realname" };

    @Override
    public void startElement(String nsURI, String strippedName, String tagName, Attributes attributes) throws SAXException {
        // The media URI is only stored, it is not searchable
        if (tagName.equalsIgnoreCase("mediauri")) {
            startField("uri", Field.Store.YES, Field.Index.NO);
            return;
        }

        // Stored and analyzed textual elements; the field is named after the tag
        if (tagName.equals("title") || tagName.equals("description") || tagName.equals("tag")) {
            startField(tagName, Field.Store.YES, Field.Index.ANALYZED);
            return;
        }

        // Comments are analyzed but their text is not stored
        if (tagName.equals("comment")) {
            startField("comment", Field.Store.NO, Field.Index.ANALYZED);
            return;
        }

        // Owner names are taken from the tag attributes; the field is named after the attribute
        if (tagName.equals("owner")) {
            for (String attributeName : OWNER_ATTRIBUTES)
                startField(attributeName, attributes.getValue(attributeName), Field.Store.YES, Field.Index.ANALYZED);
            return;
        }

        // The date of taking is kept in a non-analyzed field
        if (tagName.equals("dates"))
            startField("taken", attributes.getValue("taken"), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
    }
}
messif/objects/text/lucene/LuceneAlgorithm.java 0 → 100644 +84 −0 Original line number Diff line number Diff line package messif.objects.text.lucene; import java.io.File; import java.io.IOException; import messif.algorithms.Algorithm; import messif.objects.NoDataObject; import messif.objects.impl.ObjectString; import messif.operations.query.KNNQueryOperation; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; /** * Encapsulation of a Lucene index into MESSIF algorithm. * Note that this algorithm works only with {@link ObjectString} objects * and the distance function is ignored. Note also, that this algorithm * is not dynamic, thus no data-manipulation operations are supported. * * @author xbatko */ public class LuceneAlgorithm extends Algorithm { /** class serial id for serialization */ private static final long serialVersionUID = 1L; /** Lucene index searcher */ private final Searcher searcher; /** Lucene query parser */ private final QueryParser queryParser; /** Name of the field that stores the document locator */ private final String locatorField; /** * Creates a new Lucene algorithm on a given index directory. * The index file must be created first (e.g. by using CoPhIRSAXAllInOneIndexer). 
* * @param indexDir the Lucene index directory * @param locatorField the name of the field in the indexed documents where the locator is stored * @param searchFields the names of the fields in the indexed documents that are used for searching * @throws CorruptIndexException if the index file is corrupted * @throws IOException if there was an error reading from the index file */ @AlgorithmConstructor(description="Lucene index", arguments={"index directory", "locator field name", "search field names array"}) public LuceneAlgorithm(File indexDir, String locatorField, String... searchFields) throws CorruptIndexException, IOException { super("Lucene index in " + indexDir); // Open the index this.searcher = new IndexSearcher(IndexReader.open(FSDirectory.open(indexDir), true)); // just searching, so read-only=true // Set the locator field this.locatorField = locatorField; // Initialize query parser StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); if (searchFields.length == 1) this.queryParser = new QueryParser(Version.LUCENE_CURRENT, searchFields[0], analyzer); else this.queryParser = new MultiFieldQueryParser(Version.LUCENE_CURRENT, searchFields, analyzer); } /** * Search the index and returns the results. * @param operation the kNN operation with {@link ObjectString} query object * @throws ParseException if there was an error parsing the search text * @throws IOException if there was an error reading data from the index */ public void knnSearch(KNNQueryOperation operation) throws ParseException, IOException { TopDocs topDocs = searcher.search(queryParser.parse(((ObjectString)operation.getQueryObject()).getStringData()), operation.getK()); for (ScoreDoc match : topDocs.scoreDocs) { Document doc = searcher.doc(match.doc); operation.addToAnswer(new NoDataObject(doc.get(locatorField)), match.score, null); } operation.endOperation(); } }
messif/objects/text/lucene/LuceneBuilder.java 0 → 100644 +88 −0 Original line number Diff line number Diff line package messif.objects.text.lucene; import java.io.Closeable; import java.io.File; import java.io.IOException; import java.io.InputStream; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import messif.utility.XmlFileProcessor; import messif.utility.XmlFileSearch; import messif.utility.reflection.InstantiatorSignature; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; /** * Processor for creating Lucene indexes. * @author xbatko */ public class LuceneBuilder implements XmlFileProcessor, Closeable { /** Index writer */ private final IndexWriter writer; /** XML document parser */ private final SAXParser parser; /** XML document parsing handler */ private final LuceneXmlFileHandler handler; /** * Creates a Lucene index builder. * @param indexDir the directory where the index will be created (add to, if it already exists) * @param handler the XML document parsing handler * @param analyzer the Lucene analyzer for text normalization * @throws Exception if there was a problem initializing the builder */ protected LuceneBuilder(File indexDir, LuceneXmlFileHandler handler, Analyzer analyzer) throws Exception { this.writer = new IndexWriter( FSDirectory.open(indexDir), analyzer == null ? 
new StandardAnalyzer(Version.LUCENE_CURRENT) : analyzer, true, // Create if not exists IndexWriter.MaxFieldLength.UNLIMITED ); this.parser = SAXParserFactory.newInstance().newSAXParser(); this.handler = handler; } public void xmlFileProcess(String source, InputStream xmlFileData) { try { parser.parse(xmlFileData, handler); writer.addDocument(handler.getLuceneDocument()); handler.releaseLuceneDocument(); } catch (Exception e) { System.err.print("Error processing '"); System.err.print(source); System.err.print("': "); System.err.println(e); } } public void close() throws IOException { writer.optimize(); writer.close(); } /** * Builds the Lucene index. * @param args index directory, handler instance, * @throws Exception if there was an error */ @SuppressWarnings("UseOfSystemOutOrSystemErr") public static void main(String[] args) throws Exception { if (args.length < 3) { System.err.println("Usage: " + LuceneBuilder.class.getName() + " <directory> <handler instance> <dir|TAR file> ..."); System.exit(1); } LuceneBuilder processor = new LuceneBuilder( new File(args[0]), InstantiatorSignature.createInstanceWithStringArgs(args[1], LuceneXmlFileHandler.class, null), null ); try { XmlFileSearch.process(processor, 2, args); } finally { processor.close(); } } }
messif/objects/text/lucene/LuceneXmlFileHandler.java 0 → 100644 +96 −0 Original line number Diff line number Diff line package messif.objects.text.lucene; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; /** * Abstract ancestor of all handlers that can parse a Lucene {@link Document}. * @author xbatko */ public abstract class LuceneXmlFileHandler extends DefaultHandler { /** Current Lucene document */ private Document document; /** Name of the document field currently filled from the XML file */ private String fieldName; /** Flag whether to store the currently filled field data in the index */ private Field.Store fieldStore; /** Flag whether to index the currently filled field */ private Field.Index fieldIndex; /** Data for the currently filled field */ private StringBuilder fieldValue; /** * Starts a document field with character data within the tag. * @param name the name of the document field * @param store flag whether to store the field data in the index * @param index flag whether to index the field data */ protected final void startField(String name, Field.Store store, Field.Index index) { fieldName = name; fieldStore = store; fieldIndex = index; fieldValue = new StringBuilder(); } /** * Starts a document field with the given value. * Note that the character data within the tag are ignored and the field * is added only if the value is not <tt>null</tt>. * @param name the name of the document field * @param value the value for the field * @param store flag whether to store the field data in the index * @param index flag whether to index the field data */ protected final void startField(String name, String value, Field.Store store, Field.Index index) { if (name != null && value != null) document.add(new Field(name, value, store, index)); } /** * Finishes the current field and adds it to the document. 
*/ protected final void endField() { if (fieldValue != null) { document.add(new Field(fieldName, fieldValue.toString(), fieldStore, fieldIndex)); fieldName = null; fieldStore = null; fieldIndex = null; fieldValue = null; } } @Override public void startDocument() throws SAXException { document = new Document(); } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (fieldValue != null) fieldValue.append(ch, start, length); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { endField(); } /** * Returns the current Lucene document. * A new document is created whenever this handlers is used * to parse a new document, i.e. when {@link #startDocument()} is called. * @return the current Lucene document */ public Document getLuceneDocument() { return document; } /** * Releases the allocated Lucene document. */ public void releaseLuceneDocument() { document = null; } }
messif/objects/text/lucene/handlers/CoPhIRAllInOneHandler.java 0 → 100644 +33 −0 Original line number Diff line number Diff line package messif.objects.text.lucene.handlers; import messif.objects.text.lucene.LuceneXmlFileHandler; import org.apache.lucene.document.Field; import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** * Implementation of the {@link LuceneXmlFileHandler} that extracts the content * of certain CoPhIR text attributes and elements. * The content is then stored in a Lucene document in a single field "allinone". */ public class CoPhIRAllInOneHandler extends LuceneXmlFileHandler { @Override public void startElement(String nsURI, String strippedName, String tagName, Attributes attributes) throws SAXException { if (tagName.equalsIgnoreCase("mediauri")) { startField("uri", Field.Store.YES, Field.Index.NO); } else if (tagName.equals("title")) { startField("allinone", Field.Store.YES, Field.Index.ANALYZED); } else if (tagName.equals("description")) { startField("allinone", Field.Store.YES, Field.Index.ANALYZED); } else if (tagName.equals("tag")) { startField("allinone", Field.Store.YES, Field.Index.ANALYZED); } else if (tagName.equals("comment")) { startField("allinone", Field.Store.NO, Field.Index.ANALYZED); } else if (tagName.equals("owner")) { startField("allinone", attributes.getValue("username"), Field.Store.YES, Field.Index.ANALYZED); startField("allinone", attributes.getValue("realname"), Field.Store.YES, Field.Index.ANALYZED); } else if (tagName.equals("dates")) { startField("taken", attributes.getValue("taken"), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); } } }
messif/objects/text/lucene/handlers/CoPhIRFieldsHandler.java 0 → 100644 +33 −0 Original line number Diff line number Diff line package messif.objects.text.lucene.handlers; import messif.objects.text.lucene.LuceneXmlFileHandler; import org.apache.lucene.document.Field; import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** * Implementation of the {@link LuceneXmlFileHandler} that extracts the content * of certain CoPhIR text attributes and elements. * The content is then stored in a Lucene document in corresponding fields. */ public class CoPhIRFieldsHandler extends LuceneXmlFileHandler { @Override public void startElement(String nsURI, String strippedName, String tagName, Attributes attributes) throws SAXException { if (tagName.equalsIgnoreCase("mediauri")) { startField("uri", Field.Store.YES, Field.Index.NO); } else if (tagName.equals("title")) { startField("title", Field.Store.YES, Field.Index.ANALYZED); } else if (tagName.equals("description")) { startField("description", Field.Store.YES, Field.Index.ANALYZED); } else if (tagName.equals("tag")) { startField("tag", Field.Store.YES, Field.Index.ANALYZED); } else if (tagName.equals("comment")) { startField("comment", Field.Store.NO, Field.Index.ANALYZED); } else if (tagName.equals("owner")) { startField("username", attributes.getValue("username"), Field.Store.YES, Field.Index.ANALYZED); startField("realname", attributes.getValue("realname"), Field.Store.YES, Field.Index.ANALYZED); } else if (tagName.equals("dates")) { startField("taken", attributes.getValue("taken"), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); } } }