4 Jan 2011 18:48
Re: parsing Java log file with Lucene 3.0.3
Benzion G <benzionk <at> yahoo.com>
2011-01-04 17:48:01 GMT
2011-01-04 17:48:01 GMT
OK, I succeeded in writing the Analyzer I need. I can't say that I fully understand
the Lucene Analyzer-Tokenizer-Filter logic, but MyAnalyzer is attached below.
I hope it will help somebody else.
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
public class MyAnalyzer extends Analyzer
{
public TokenStream tokenStream(String field, final Reader reader)
{
TokenStream result = new MyCharTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(true, result,
StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return result;
}
static class MyCharTokenizer extends CharTokenizer
{
public static final char[] BAD_CHARACTERS =
{ '.', ',', ':', '(', ')', ' ', '[', ']', ';', '\'', '"', '|', '-', '_',
'*', '<', '>', '=', '+', '%', '#', '~', '`', '^'};
public MyCharTokenizer(Reader input)
{
super(input);
}
<at> Override
protected boolean isTokenChar(char paramChar)
{
if (Character.isLetterOrDigit(paramChar))
{
return true;
}
else
{
return false;
}
//if you need to filter out specific characters and not just
non-digits-or-letters as above
//for (int i = 0; i < BAD_CHARACTERS.length; i++)
//{
// if (BAD_CHARACTERS[i] == paramChar)
// {
// return false;
// }
//}
//return true;
}
}
}
--
--
View this message in context: http://lucene.472066.n3.nabble.com/parsing-Java-log-file-with-Lucene-3-0-3-tp2173046p2193022.html
Sent from the Lucene - Java Users mailing list archive at Nabble.com.
RSS Feed