--- core/src/dualist/pipes/DocumentPipe.java.orig 2012-02-11 05:07:28.000000000 +0900
+++ core/src/dualist/pipes/DocumentPipe.java 2012-02-22 22:44:45.000000000 +0900
@@ -13,6 +13,8 @@
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.types.Instance;
+import dualist.pipes.SimpleMecabPipe;
+
public class DocumentPipe extends Pipe {
private Pipe myPipe = new SerialPipes(new Pipe[] {
@@ -24,6 +26,9 @@
new CharSequenceReplace(Pattern.compile("&(.*?);"), ""),
new CharSequenceReplace(Pattern.compile("[0-9]+"), "00"),
new CharSequenceLowercase(),
+ // Use SimpleMecabPipe when the "dualist.lang" system property is "ja"; otherwise the regex tokenizer below.
+ "ja".equals(System.getProperty("dualist.lang")) ?
+ new SimpleMecabPipe() :
// new CharSequence2TokenSequence(CharSequenceLexer.LEX_WORD_CLASSES),
new CharSequence2TokenSequence("[\\p{L}\\p{Mn}]+"),
new TokenSequenceRemoveStopwords(),
--- build.xml.orig 2012-03-08 23:07:56.000000000 +0900
+++ build.xml 2012-03-09 09:32:14.000000000 +0900
@@ -26,7 +26,7 @@
-
+