#!/usr/bin/env python
"""Run tab-separated queries from standard input against a Lucene index.

Each input line has the form ``<query id>\t<query text>``.  For every query
the top 1000 hits of the "contents" field are printed to standard output,
one line per hit, as whitespace-separated columns:

    <query id> Q0 <document name> <rank> <score> <run tag>

The index is expected in a directory named ``INDEX_FILENAME`` located next
to this script, and the run tag is read from the third command-line
argument.
"""

import sys
import os
import lucene

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import Version
from org.apache.lucene.search import ScoreDoc
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.search.similarities import TFIDFSimilarity

# Name of the index directory, resolved relative to this script's location.
INDEX_FILENAME = "IndexFiles.index"


def run(searcher, analyzer, runtag):
    """Read queries from standard input and print their ranked results.

    Args:
        searcher: IndexSearcher used to execute each query and load hits.
        analyzer: analyzer handed to the QueryParser for the "contents" field.
        runtag: label written in the last column of every output line.

    Blank lines are ignored.  Lines that contain no tab separator (and hence
    no query text) are skipped with a warning on standard error instead of
    aborting the whole run.
    """
    # The parser does not depend on the query, so build it once.
    parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)

    # Read line by line until EOF (readline() returns '' only at end of
    # input).  Using sys.stdin avoids opening, and never closing, a second
    # handle on /dev/stdin.
    for line in iter(sys.stdin.readline, ""):
        if not line.strip():
            continue

        # Split into query id and query text, stripping the trailing
        # newline (and surrounding whitespace) from the last field.
        fields = line.split("\t")
        fields[-1] = fields[-1].strip()
        if len(fields) < 2:
            sys.stderr.write("skipping malformed query line: %r\n" % line)
            continue
        query_id, query_text = fields[0], fields[1]

        query = parser.parse(query_text)

        # Retrieve the top 1000 documents; ranks start at 1.
        top_docs = searcher.search(query, 1000)
        for rank, score_doc in enumerate(top_docs.scoreDocs, 1):
            doc = searcher.doc(score_doc.doc)
            # Keep only the part of the stored name before its first dot,
            # which drops an extension such as ".txt".
            docname = doc.get("name").split(".")[0]
            # A single formatted argument keeps this valid on both Python 2
            # and 3 and reproduces the separators emitted by the original
            # Python 2 print statement.
            print("%s \tQ0\t%s \t%s \t%s \t%s"
                  % (query_id, docname, rank, score_doc.score, runtag))


# init Java Virtual Machine
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# get the absolute path of this script
base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))

# get the absolute path of the index
index_dir = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_FILENAME)))

# define a searcher for the index
searcher = IndexSearcher(DirectoryReader.open(index_dir))

# set the standard query analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

# The run tag is taken from the third command-line argument.
# NOTE(review): the first two arguments are not read anywhere in this
# script - confirm against how it is invoked.
runtag = sys.argv[3]

# run searcher using the analyzer
run(searcher, analyzer, runtag)
del searcher