BASIS TECHNOLOGY の Rosette と Kuromoji で「すもももももももものうち」を形態素解析して比較してみたメモ

#!/usr/local/jython/bin/jython
# -*- coding: utf-8 -*-

# Kuromoji と Rosette のパース対決

# Kuromoji
from com.atilika.kuromoji.unidic import Tokenizer

# BASIS TECHNOLOGY Rosette 
from com.basistech.util import Pathnames
from com.basistech.util import LanguageCode
from com.basistech.rlp import EnvironmentParameters
from com.basistech.rlp import RLPEnvironment
from com.basistech.rlp import ContextParameters
from com.basistech.rlp import TokenIteratorResultAccess
from com.basistech.rlp import ResultAccess
from com.basistech.rlp import TokenData

from java.io import File
import sys, os

if __name__ == "__main__":
 parseWord = u"すもももももももものうち"

 # Kuromoji ↓
 tokenizer = Tokenizer()
 tokens = tokenizer.tokenize( parseWord )

 print "\nKuromoji"
 for token in tokens:
  print token.getSurface() + "\t" + token.getAllFeatures()
 # Kuromoji ↑

 # BASIS TECHNOLOGY Rosette ↓
 # パラメータ設定 #
 btRoot = "/hoge/BasisTech"
 Pathnames.setBTRootDirectory( btRoot )
 envParams = EnvironmentParameters()
 environmentPath = btRoot + "/rlp/etc/rlp-environment.xml"
 envParams.setEnvironmentDefinition( File( environmentPath ) )
 rlpEnv = RLPEnvironment( envParams )
 rlpEnv.initialize()
 contextParam = ContextParameters()
 contextPath = btRoot + "/rlp/samples/etc/rlp-bl-context.xml"
 contextParam.setContextDefinition( File(contextPath) )
 rlpContext = rlpEnv.getContext(contextParam)
 rlpContext.setProperty("com.basistech.jsonw.skip", "true")

 # 形態素解析
 rlpContext.process(parseWord, LanguageCode.UNKNOWN)

 # 形態素解析結果の取り出し
 resultAccess = ResultAccess(rlpContext)
 tokenResultAccess = TokenIteratorResultAccess( resultAccess )
 tokenData = TokenData()

 print "\nBASIS TECHNOLOGY Rosette"
 while tokenResultAccess.next(tokenData):
  print tokenData.getText() +'\t',

  if tokenData.getPartOfSpeech():
   print tokenData.getPartOfSpeech(),

  if tokenData.getLemma():
   print tokenData.getLemma(),

  print
 # BASIS TECHNOLOGY Rosette ↑

Kuromoji
すもも  名詞,普通名詞,一般,*,*,*,スモモ,李,すもも,スモモ,すもも,スモモ,和,*,*,*,*
も      助詞,係助詞,*,*,*,*,モ,も,も,モ,も,モ,和,*,*,*,*
もも    名詞,普通名詞,一般,*,*,*,モモ,桃,もも,モモ,もも,モモ,和,*,*,*,*
も      助詞,係助詞,*,*,*,*,モ,も,も,モ,も,モ,和,*,*,*,*
もも    名詞,普通名詞,一般,*,*,*,モモ,桃,もも,モモ,もも,モモ,和,*,*,*,*
の      助詞,格助詞,*,*,*,*,ノ,の,の,ノ,の,ノ,和,*,*,*,*
うち    名詞,普通名詞,副詞可能,*,*,*,ウチ,内,うち,ウチ,うち,ウチ,和,*,*,*,*

BASIS TECHNOLOGY Rosette
すもも  NC
もも    NC
もも    NC
も      PL
もの    PL
うち    V うつ