使用NLTK进行分词及词性标注

1、首先是安装
1.1、安装Python 3.4
注意要用32位版本
http://www.python.org/downloads/

1.2、安装Numpy
注意两点:一是并非所有版本都提供Windows安装包,二是要选择支持Python 3.4的安装包
http://sourceforge.net/projects/numpy/files/NumPy/

1.3、安装NLTK
注意3.2版本有bug,不要用。
http://pypi.python.org/pypi/nltk

2、下载NLTK Data
方法1:
在python中运行:

# Load NLTK so that its data downloader can be called.
import nltk
# Called with no arguments; presumably this opens NLTK's interactive
# downloader for choosing which data packages to install -- confirm against
# the NLTK documentation for the installed version.
nltk.download()

方法2:
打开下面的索引文件(index.xml),在其中找到所需数据包的下载链接,然后手动下载并解压
https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml

3、进行分词
3.1、设置环境变量

rem Installation directory of the 32-bit Python 3.4 interpreter.
set PYTHON_HOME=C:\NeoLanguages\Python34_x86
rem Prepend it to PATH so that "python" resolves to this installation first.
set PATH=%PYTHON_HOME%;%PATH%
rem Directory holding the NLTK data downloaded in step 2
rem (presumably NLTK reads this variable to locate its data -- confirm).
set NLTK_DATA=D:\NLP\NLTK\nltk_data
rem Start the interactive interpreter; the leading "@" hides the command echo.
@python

3.2、py文件

#!/usr/bin/env python3
# Demo: tokenize a sentence with NLTK, tag parts of speech, then run
# named-entity chunking. Requires the NLTK data downloaded in step 2.

import nltk

# Test sentence.
# NOTE: the typographic apostrophes (’) are kept exactly as in the original.
# In the recorded run, 'Don’t' and 'can’t' come out as single tokens;
# presumably straight apostrophes (') would be split differently -- verify.
sentence = (
    "Don’t ever let somebody tell you you can’t do something, not even me. "
    "You got a dream, you gotta protect it. People can’t do something themselves, "
    "they wanna tell you you can’t do it. If you want something, go get it. Period."
)

# Tokenization: split the sentence into word and punctuation tokens.
tokens = nltk.word_tokenize(sentence)

# Part-of-speech tagging: produces a list of (token, tag) pairs.
tagged = nltk.pos_tag(tokens)

# Named-entity chunking (not full syntactic parsing): returns a Tree in which
# recognized entities become labelled subtrees; the recorded run wraps
# 'Period' in a PERSON subtree.
entities = nltk.chunk.ne_chunk(tagged)

3.3、逐句运行

D:\MyProjects\NLP\NLTK>python
Python 3.4.4 (v3.4.4:737efcadf5a6, Dec 20 2015, 19:28:18) [MSC v.1600 32 bit (In
tel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> sentence = "Don’t ever let somebody tell you you can’t do something, not e
ven me. \
... You got a dream, you gotta protect it. People can’t do something themselves
, \
... they wanna tell you you can’t do it. If you want something, go get it. Peri
od."
>>> tokens = nltk.word_tokenize(sentence)
>>> tagged = nltk.pos_tag(tokens)
>>> entities = nltk.chunk.ne_chunk(tagged)

>>> tokens
['Don’t', 'ever', 'let', 'somebody', 'tell', 'you', 'you', 'can’t', 'do', 'som
ething', ',', 'not', 'even', 'me', '.', 'You', 'got', 'a', 'dream', ',', 'you',
'got', 'ta', 'protect', 'it', '.', 'People', 'can’t', 'do', 'something', 'thems
elves', ',', 'they', 'wan', 'na', 'tell', 'you', 'you', 'can’t', 'do', 'it', '.
', 'If', 'you', 'want', 'something', ',', 'go', 'get', 'it', '.', 'Period', '.']

>>> tagged
[('Don’t', 'NNP'), ('ever', 'RB'), ('let', 'VB'), ('somebody', 'NN'), ('tell',
'VB'), ('you', 'PRP'), ('you', 'PRP'), ('can’t', 'VBP'), ('do', 'VB'), ('someth
ing', 'NN'), (',', ','), ('not', 'RB'), ('even', 'RB'), ('me', 'PRP'), ('.', '.'
), ('You', 'PRP'), ('got', 'VBD'), ('a', 'DT'), ('dream', 'NN'), (',', ','), ('y
ou', 'PRP'), ('got', 'VBD'), ('ta', 'JJ'), ('protect', 'NN'), ('it', 'PRP'), ('.
', '.'), ('People', 'NNS'), ('can’t', 'VBP'), ('do', 'VBP'), ('something', 'NN'
), ('themselves', 'PRP'), (',', ','), ('they', 'PRP'), ('wan', 'VBP'), ('na', 'T
O'), ('tell', 'VB'), ('you', 'PRP'), ('you', 'PRP'), ('can’t', 'VBP'), ('do', '
VB'), ('it', 'PRP'), ('.', '.'), ('If', 'IN'), ('you', 'PRP'), ('want', 'VBP'),
('something', 'NN'), (',', ','), ('go', 'VBP'), ('get', 'VB'), ('it', 'PRP'), ('
.', '.'), ('Period', 'NNP'), ('.', '.')]

>>> entities
Tree('S', [('Don’t', 'NNP'), ('ever', 'RB'), ('let', 'VB'), ('somebody', 'NN'),
 ('tell', 'VB'), ('you', 'PRP'), ('you', 'PRP'), ('can’t', 'VBP'), ('do', 'VB')
, ('something', 'NN'), (',', ','), ('not', 'RB'), ('even', 'RB'), ('me', 'PRP'),
 ('.', '.'), ('You', 'PRP'), ('got', 'VBD'), ('a', 'DT'), ('dream', 'NN'), (',',
 ','), ('you', 'PRP'), ('got', 'VBD'), ('ta', 'JJ'), ('protect', 'NN'), ('it', '
PRP'), ('.', '.'), ('People', 'NNS'), ('can’t', 'VBP'), ('do', 'VBP'), ('someth
ing', 'NN'), ('themselves', 'PRP'), (',', ','), ('they', 'PRP'), ('wan', 'VBP'),
 ('na', 'TO'), ('tell', 'VB'), ('you', 'PRP'), ('you', 'PRP'), ('can’t', 'VBP')
, ('do', 'VB'), ('it', 'PRP'), ('.', '.'), ('If', 'IN'), ('you', 'PRP'), ('want'
, 'VBP'), ('something', 'NN'), (',', ','), ('go', 'VBP'), ('get', 'VB'), ('it',
'PRP'), ('.', '.'), Tree('PERSON', [('Period', 'NNP')]), ('.', '.')])
>>>