Commit 29c66419 authored by Michal Batko's avatar Michal Batko
Browse files

* Removed special exception handling for XmlFile processing

* Added Lucene support (note that lucene-core library is needed)
parent 81413841
Loading
Loading
Loading
Loading
+84 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene;

import java.io.File;
import java.io.IOException;
import messif.algorithms.Algorithm;
import messif.objects.NoDataObject;
import messif.objects.impl.ObjectString;
import messif.operations.query.KNNQueryOperation;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Encapsulation of a Lucene index into MESSIF algorithm.
 * Note that this algorithm works only with {@link ObjectString} objects
 * and the distance function is ignored. Note also, that this algorithm
 * is not dynamic, thus no data-manipulation operations are supported.
 * 
 * @author xbatko
 */
public class LuceneAlgorithm extends Algorithm {
    /** class serial id for serialization */
    private static final long serialVersionUID = 1L;

    /** Searcher over the underlying Lucene index */
    private final Searcher searcher;
    /** Parser that converts query strings into Lucene queries */
    private final QueryParser queryParser;
    /** Name of the indexed-document field that holds the object locator */
    private final String locatorField;

    /**
     * Creates a new Lucene algorithm on a given index directory.
     * The index file must be created first (e.g. by using CoPhIRSAXAllInOneIndexer).
     * 
     * @param indexDir the Lucene index directory
     * @param locatorField the name of the field in the indexed documents where the locator is stored
     * @param searchFields the names of the fields in the indexed documents that are used for searching
     * @throws CorruptIndexException if the index file is corrupted
     * @throws IOException if there was an error reading from the index file
     */
    @AlgorithmConstructor(description="Lucene index", arguments={"index directory", "locator field name", "search field names array"})
    public LuceneAlgorithm(File indexDir, String locatorField, String... searchFields) throws CorruptIndexException, IOException {
        super("Lucene index in " + indexDir);

        // Open the index read-only - this algorithm only searches, never modifies
        this.searcher = new IndexSearcher(IndexReader.open(FSDirectory.open(indexDir), true));
        this.locatorField = locatorField;

        // A single search field uses the plain parser, multiple fields need the multi-field one
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        this.queryParser = (searchFields.length == 1)
                ? new QueryParser(Version.LUCENE_CURRENT, searchFields[0], analyzer)
                : new MultiFieldQueryParser(Version.LUCENE_CURRENT, searchFields, analyzer);
    }

    /**
     * Search the index and returns the results.
     * The text of the {@link ObjectString} query object is parsed as a Lucene
     * query and the top-k matches are added to the operation's answer
     * (the Lucene score is used as the distance).
     * @param operation the kNN operation with {@link ObjectString} query object
     * @throws ParseException if there was an error parsing the search text
     * @throws IOException if there was an error reading data from the index
     */
    public void knnSearch(KNNQueryOperation operation) throws ParseException, IOException {
        String queryText = ((ObjectString)operation.getQueryObject()).getStringData();
        TopDocs topDocs = searcher.search(queryParser.parse(queryText), operation.getK());
        for (ScoreDoc match : topDocs.scoreDocs) {
            // Only the locator is returned, so a NoDataObject is sufficient for the answer
            Document doc = searcher.doc(match.doc);
            operation.addToAnswer(new NoDataObject(doc.get(locatorField)), match.score, null);
        }
        operation.endOperation();
    }
}
+88 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import messif.utility.XmlFileProcessor;
import messif.utility.XmlFileSearch;
import messif.utility.reflection.InstantiatorSignature;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Processor for creating Lucene indexes.
 * @author xbatko
 */
public class LuceneBuilder implements XmlFileProcessor, Closeable {
    /** Index writer */
    private final IndexWriter writer;
    /** XML document parser */
    private final SAXParser parser;
    /** XML document parsing handler */
    private final LuceneXmlFileHandler handler;

    /**
     * Creates a Lucene index builder.
     * @param indexDir the directory where the index will be created (add to, if it already exists)
     * @param handler the XML document parsing handler
     * @param analyzer the Lucene analyzer for text normalization
     * @throws Exception if there was a problem initializing the builder
     */
    protected LuceneBuilder(File indexDir, LuceneXmlFileHandler handler, Analyzer analyzer) throws Exception {
        this.writer = new IndexWriter(
                FSDirectory.open(indexDir),
                analyzer == null ? new StandardAnalyzer(Version.LUCENE_CURRENT) : analyzer,
                true, // Create if not exists
                IndexWriter.MaxFieldLength.UNLIMITED
        );
        this.parser = SAXParserFactory.newInstance().newSAXParser();
        this.handler = handler;
    }

    public void xmlFileProcess(String source, InputStream xmlFileData) {
        try {
            parser.parse(xmlFileData, handler);
            writer.addDocument(handler.getLuceneDocument());
            handler.releaseLuceneDocument();
        } catch (Exception e) {
            System.err.print("Error processing '");
            System.err.print(source);
            System.err.print("': ");
            System.err.println(e);
        }        
    }

    public void close() throws IOException {
        writer.optimize();
        writer.close();        
    }

    /**
     * Builds the Lucene index.
     * @param args index directory, handler instance,
     * @throws Exception if there was an error 
     */
    @SuppressWarnings("UseOfSystemOutOrSystemErr")
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println("Usage: " + LuceneBuilder.class.getName() + " <directory> <handler instance> <dir|TAR file> ...");
            System.exit(1);
        }
        LuceneBuilder processor = new LuceneBuilder(
                new File(args[0]),
                InstantiatorSignature.createInstanceWithStringArgs(args[1], LuceneXmlFileHandler.class, null),
                null
        );
        try {
            XmlFileSearch.process(processor, 2, args);
        } finally {
            processor.close();
        }
    }
}
+96 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Abstract ancestor of all handlers that can parse a Lucene {@link Document}.
 * @author xbatko
 */
public abstract class LuceneXmlFileHandler extends DefaultHandler {
    /** Lucene document being built for the currently parsed XML file */
    private Document document;
    /** Name of the field currently being collected (<tt>null</tt> when no field is open) */
    private String openFieldName;
    /** Storage flag of the currently collected field */
    private Field.Store openFieldStore;
    /** Indexing flag of the currently collected field */
    private Field.Index openFieldIndex;
    /** Buffer accumulating character data of the currently collected field */
    private StringBuilder openFieldData;

    /**
     * Starts a document field with character data within the tag.
     * The character data reported by the parser are collected until
     * {@link #endField()} is called.
     * @param name the name of the document field
     * @param store flag whether to store the field data in the index
     * @param index flag whether to index the field data
     */
    protected final void startField(String name, Field.Store store, Field.Index index) {
        openFieldName = name;
        openFieldStore = store;
        openFieldIndex = index;
        openFieldData = new StringBuilder();
    }

    /**
     * Starts a document field with the given value.
     * Note that the character data within the tag are ignored and the field
     * is added only if the value is not <tt>null</tt>.
     * @param name the name of the document field
     * @param value the value for the field
     * @param store flag whether to store the field data in the index
     * @param index flag whether to index the field data
     */
    protected final void startField(String name, String value, Field.Store store, Field.Index index) {
        if (name == null || value == null)
            return;
        document.add(new Field(name, value, store, index));
    }

    /**
     * Finishes the current field and adds it to the document.
     * Does nothing when no field is currently open.
     */
    protected final void endField() {
        if (openFieldData == null)
            return;
        document.add(new Field(openFieldName, openFieldData.toString(), openFieldStore, openFieldIndex));
        openFieldName = null;
        openFieldStore = null;
        openFieldIndex = null;
        openFieldData = null;
    }

    @Override
    public void startDocument() throws SAXException {
        // A fresh Lucene document is created for every parsed XML file
        document = new Document();
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        // Character data are collected only while a field is open
        if (openFieldData == null)
            return;
        openFieldData.append(ch, start, length);
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        endField();
    }

    /**
     * Returns the current Lucene document.
     * A new document is created whenever this handler is used
     * to parse a new document, i.e. when {@link #startDocument()} is called.
     * @return the current Lucene document
     */
    public Document getLuceneDocument() {
        return document;
    }

    /**
     * Releases the allocated Lucene document.
     */
    public void releaseLuceneDocument() {
        document = null;
    }
}
+33 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene.handlers;

import messif.objects.text.lucene.LuceneXmlFileHandler;
import org.apache.lucene.document.Field;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Implementation of the {@link LuceneXmlFileHandler} that extracts the content
 * of certain CoPhIR text attributes and elements.
 * The content is then stored in a Lucene document in a single field "allinone".
 */
public class CoPhIRAllInOneHandler extends LuceneXmlFileHandler {
    @Override
    public void startElement(String nsURI, String strippedName, String tagName, Attributes attributes) throws SAXException {
        // The media URI is stored verbatim for retrieval but is not searchable
        if (tagName.equalsIgnoreCase("mediauri")) {
            startField("uri", Field.Store.YES, Field.Index.NO);
            return;
        }
        // Title, description and tag contents are all merged into the single "allinone" field
        if (tagName.equals("title") || tagName.equals("description") || tagName.equals("tag")) {
            startField("allinone", Field.Store.YES, Field.Index.ANALYZED);
        } else if (tagName.equals("comment")) {
            // Comments are searchable but their text is not stored
            startField("allinone", Field.Store.NO, Field.Index.ANALYZED);
        } else if (tagName.equals("owner")) {
            startField("allinone", attributes.getValue("username"), Field.Store.YES, Field.Index.ANALYZED);
            startField("allinone", attributes.getValue("realname"), Field.Store.YES, Field.Index.ANALYZED);
        } else if (tagName.equals("dates")) {
            startField("taken", attributes.getValue("taken"), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        }
    }
}
+33 −0
Original line number Diff line number Diff line
package messif.objects.text.lucene.handlers;

import messif.objects.text.lucene.LuceneXmlFileHandler;
import org.apache.lucene.document.Field;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Implementation of the {@link LuceneXmlFileHandler} that extracts the content
 * of certain CoPhIR text attributes and elements.
 * The content is then stored in a Lucene document in corresponding fields.
 */
public class CoPhIRFieldsHandler extends LuceneXmlFileHandler {
    @Override
    public void startElement(String nsURI, String strippedName, String tagName, Attributes attributes) throws SAXException {
        // Each recognized CoPhIR element is stored in its own Lucene field
        if (tagName.equalsIgnoreCase("mediauri")) {
            // The media URI is stored verbatim but is not searchable
            startField("uri", Field.Store.YES, Field.Index.NO);
            return;
        }
        if (tagName.equals("title")) {
            startField("title", Field.Store.YES, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("description")) {
            startField("description", Field.Store.YES, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("tag")) {
            startField("tag", Field.Store.YES, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("comment")) {
            // Comments are searchable but their text is not stored
            startField("comment", Field.Store.NO, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("owner")) {
            // Owner names come from attributes, not from element character data
            startField("username", attributes.getValue("username"), Field.Store.YES, Field.Index.ANALYZED);
            startField("realname", attributes.getValue("realname"), Field.Store.YES, Field.Index.ANALYZED);
            return;
        }
        if (tagName.equals("dates")) {
            startField("taken", attributes.getValue("taken"), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        }
    }
}
Loading