#!/usr/local/jython/bin/jython # -*- coding: utf-8 -*- # Kuromoji と Rosette のパース対決 # Kuromoji from com.atilika.kuromoji.unidic import Tokenizer # BASIS TECHNOLOGY Rosette from com.basistech.util import Pathnames from com.basistech.util import LanguageCode from com.basistech.rlp import EnvironmentParameters from com.basistech.rlp import RLPEnvironment from com.basistech.rlp import ContextParameters from com.basistech.rlp import TokenIteratorResultAccess from com.basistech.rlp import ResultAccess from com.basistech.rlp import TokenData from java.io import File import sys, os if __name__ == "__main__": parseWord = u"すもももももももものうち" # Kuromoji ↓ tokenizer = Tokenizer() tokens = tokenizer.tokenize( parseWord ) print "\nKuromoji" for token in tokens: print token.getSurface() + "\t" + token.getAllFeatures() # Kuromoji ↑ # BASIS TECHNOLOGY Rosette ↓ # パラメータ設定 # btRoot = "/hoge/BasisTech" Pathnames.setBTRootDirectory( btRoot ) envParams = EnvironmentParameters() environmentPath = btRoot + "/rlp/etc/rlp-environment.xml" envParams.setEnvironmentDefinition( File( environmentPath ) ) rlpEnv = RLPEnvironment( envParams ) rlpEnv.initialize() contextParam = ContextParameters() contextPath = btRoot + "/rlp/samples/etc/rlp-bl-context.xml" contextParam.setContextDefinition( File(contextPath) ) rlpContext = rlpEnv.getContext(contextParam) rlpContext.setProperty("com.basistech.jsonw.skip", "true") # 形態素解析 rlpContext.process(parseWord, LanguageCode.UNKNOWN) # 形態素解析結果の取り出し resultAccess = ResultAccess(rlpContext) tokenResultAccess = TokenIteratorResultAccess( resultAccess ) tokenData = TokenData() print "\nBASIS TECHNOLOGY Rosette" while tokenResultAccess.next(tokenData): print tokenData.getText() +'\t', if tokenData.getPartOfSpeech(): print tokenData.getPartOfSpeech(), if tokenData.getLemma(): print tokenData.getLemma(), print # BASIS TECHNOLOGY Rosette ↑
# Sample output:
#
# Kuromoji
# すもも	名詞,普通名詞,一般,*,*,*,スモモ,李,すもも,スモモ,すもも,スモモ,和,*,*,*,*
# も	助詞,係助詞,*,*,*,*,モ,も,も,モ,も,モ,和,*,*,*,*
# もも	名詞,普通名詞,一般,*,*,*,モモ,桃,もも,モモ,もも,モモ,和,*,*,*,*
# も	助詞,係助詞,*,*,*,*,モ,も,も,モ,も,モ,和,*,*,*,*
# もも	名詞,普通名詞,一般,*,*,*,モモ,桃,もも,モモ,もも,モモ,和,*,*,*,*
# の	助詞,格助詞,*,*,*,*,ノ,の,の,ノ,の,ノ,和,*,*,*,*
# うち	名詞,普通名詞,副詞可能,*,*,*,ウチ,内,うち,ウチ,うち,ウチ,和,*,*,*,*
#
# BASIS TECHNOLOGY Rosette
# すもも	NC
# もも	NC
# もも	NC
# も	PL
# もの	PL
# うち	V うつ