Some code snippets to illustrate the use of Mtas directly with Apache Lucene.
Create index
Create an index with three FoLiA files, using the configuration file folia.xml:
String configFile = "folia.xml";
HashMap<String, String> files = new HashMap<String, String>();
files.put("title 1", "resource1.xml.gz");
files.put("title 2", "resource2.xml.gz");
files.put("title 3", "resource3.xml.gz");
CreateIndex createIndex = new CreateIndex(configFile, files);
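The snippet above keeps the index in memory (see the CreateIndex class in the appendix). A minimal sketch for persisting the index to disk instead, using the second CreateIndex constructor; the path /tmp/mtas-index is a hypothetical example, any writable directory will do:

// hypothetical index location on disk
String indexPath = "/tmp/mtas-index";
CreateIndex createIndex = new CreateIndex(indexPath, configFile, files);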
Basic search
With the created index and the CQL expression [pos="LID"]
String cql = "[pos=\"LID\"]";
Directory directory = createIndex.getDirectory();
the number of hits in each document can be computed with
IndexReader indexReader = DirectoryReader.open(directory);
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT, cql, null, null);
ListIterator<LeafReaderContext> iterator = indexReader.leaves()
    .listIterator();
IndexSearcher searcher = new IndexSearcher(indexReader);
SpanWeight spanweight = ((MtasSpanQuery) q.rewrite(indexReader))
    .createWeight(searcher, false);
while (iterator.hasNext()) {
  LeafReaderContext lrc = iterator.next();
  Spans spans = spanweight.getSpans(lrc, SpanWeight.Postings.POSITIONS);
  SegmentReader r = (SegmentReader) lrc.reader();
  if (spans != null) {
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
      // skip documents that have been deleted from this segment
      if (r.numDocs() == r.maxDoc() || r.getLiveDocs().get(spans.docID())) {
        System.out.print("Document " + (lrc.docBase + spans.docID()) + ": ");
        int hits = 0;
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
          hits++;
        }
        // spans.docID() is segment-local: use it directly on the segment
        // reader, and add lrc.docBase only for the index-wide number
        System.out.println(hits + " hits in '"
            + r.document(spans.docID()).get(CreateIndex.FIELD_TITLE) + "'");
      }
    }
  }
}
indexReader.close();
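When only the number of matching documents is needed, rather than per-document hit counts, plain Lucene suffices. A minimal sketch, assuming the same directory and cql as above:

IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(indexReader);
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT, cql, null, null);
// IndexSearcher.count returns the number of documents with at least one hit
System.out.println(searcher.count(q) + " matching documents");
indexReader.close();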
Advanced search
The provided collect method also makes more advanced options available, such as computing a term vector:
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(indexReader);
ComponentField fieldStats = new ComponentField(CreateIndex.FIELD_CONTENT,
    CreateIndex.FIELD_ID);
// the full set of (three) documents to collect over
ArrayList<Integer> fullDocSet = new ArrayList<Integer>(
    Arrays.asList(new Integer[] { 0, 1, 2 }));
ArrayList<Integer> fullDocList = new ArrayList<Integer>();
try {
  // term vector over prefix "t": the top 10 terms by sum, descending
  fieldStats.termVectorList.add(new ComponentTermVector("wordList", "t",
      null, false, "n,sum", CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC,
      null, 10, null, null, null, null, null, null, null, null, null));
  CodecUtil.collect(CreateIndex.FIELD_CONTENT, searcher, indexReader,
      fullDocList, fullDocSet, fieldStats);
  for (ComponentTermVector ct : fieldStats.termVectorList) {
    HashMap<String, Map<String, Object>> tvList =
        new HashMap<String, Map<String, Object>>();
    Map<String, ?> tcList = ct.subComponentFunction.dataCollector
        .getResult().getList();
    for (String key : tcList.keySet()) {
      tvList.put(key, ((MtasDataItem<?, ?>) tcList.get(key)).rewrite(false));
    }
    System.out.println(tvList);
  }
} catch (IllegalAccessException | IllegalArgumentException
    | InvocationTargetException | mtas.parser.function.ParseException e) {
  e.printStackTrace();
}
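The same mechanism supports other annotation layers. A sketch, assuming the folia.xml configuration indexes lemma annotations under the prefix "lemma" (an assumption; the available prefixes depend on the configuration file):

// hypothetical prefix "lemma"; top 10 lemmas by total frequency
fieldStats.termVectorList.add(new ComponentTermVector("lemmaList", "lemma",
    null, false, "n,sum", CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC,
    null, 10, null, null, null, null, null, null, null, null, null));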
Appendix
Code class CreateIndex
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

public class CreateIndex {

  public static String FIELD_ID = "id";
  public static String FIELD_TITLE = "title";
  public static String FIELD_CONTENT = "content";

  private Directory directory;

  public CreateIndex(String configFile, HashMap<String, String> files)
      throws IOException {
    this(null, configFile, files);
  }

  public CreateIndex(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    initialize(indexPath, configFile, files);
  }

  public Directory getDirectory() {
    return directory;
  }

  private void initialize(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    // index on disk if a path is provided, otherwise in memory
    if (indexPath != null) {
      directory = FSDirectory.open(Paths.get(indexPath));
    } else {
      directory = new RAMDirectory();
    }
    // the char filter with type "file" interprets the field value as a
    // filename and reads the content to index from that file
    Map<String, String> paramsCharFilterMtas = new HashMap<String, String>();
    paramsCharFilterMtas.put("type", "file");
    Map<String, String> paramsTokenizer = new HashMap<String, String>();
    paramsTokenizer.put("configFile", configFile);
    Analyzer mtasAnalyzer = CustomAnalyzer
        .builder(Paths.get("docker").toAbsolutePath())
        .addCharFilter("mtas", paramsCharFilterMtas)
        .withTokenizer("mtas", paramsTokenizer).build();
    // use the Mtas analyzer only for the content field
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put(FIELD_CONTENT, mtasAnalyzer);
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
        new StandardAnalyzer(), analyzerPerField);
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setUseCompoundFile(false);
    config.setCodec(Codec.forName("MtasCodec"));
    IndexWriter w = new IndexWriter(directory, config);
    w.deleteAll();
    int counter = 0;
    for (String title : files.keySet()) {
      Document doc = new Document();
      doc.add(new StringField(FIELD_ID, Integer.valueOf(counter).toString(),
          Field.Store.YES));
      doc.add(new StringField(FIELD_TITLE, title, Field.Store.YES));
      doc.add(new TextField(FIELD_CONTENT, files.get(title), Field.Store.YES));
      w.addDocument(doc);
      counter++;
    }
    w.commit();
    w.close();
  }

}
Code method createQuery
// requires java.io.BufferedReader, java.io.Reader, java.io.StringReader,
// mtas.parser.cql.MtasCQLParser and mtas.parser.cql.ParseException
MtasSpanQuery createQuery(String field, String cql, MtasSpanQuery ignore,
    Integer maximumIgnoreLength) throws ParseException {
  Reader reader = new BufferedReader(new StringReader(cql));
  MtasCQLParser p = new MtasCQLParser(reader);
  return p.parse(field, null, null, ignore, maximumIgnoreLength);
}
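The ignore and maximumIgnoreLength parameters allow hits to match across intervening tokens. A sketch, assuming pos annotations as in the examples above (the CQL expressions and tag values are illustrative):

// match two successive nouns, ignoring up to two adjectives in between
MtasSpanQuery ignore = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"ADJ\"]", null, null);
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"N\"][pos=\"N\"]", ignore, 2);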