您现在的位置是:主页 > news > 做网站送邮箱/使用网站模板快速建站
做网站送邮箱/使用网站模板快速建站
admin2025/5/20 13:20:20【news】
简介:本文整理汇总了Python中jieba.posseg.lcut方法的典型用法代码示例,这里精选的方法代码示例或许可以为您提供帮助。
本文整理汇总了Python中jieba.posseg.lcut方法的典型用法代码示例。如果您正苦于以下问题:Python posseg.lcut方法的具体用法?Python posseg.lcut怎么用?Python posseg.lcut使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块jieba.posseg的用法示例。
在下文中一共展示了posseg.lcut方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: extract_dictionary_feature
点赞 3
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def extract_dictionary_feature(file_name, col_tag=0, col_content=1):
# ????
adv = codecs.open('./data/vocabulary/adv.txt', 'rb', encoding='utf-8').read().split('\n')
inverse = codecs.open('./data/vocabulary/inverse.txt', 'rb', encoding='utf-8').read().split('\n')
negdict = codecs.open('./data/vocabulary/negdict.txt', 'rb', encoding='utf-8').read().split('\n')
posdict = codecs.open('./data/vocabulary/posdict.txt', 'rb', encoding='utf-8').read().split('\n')
contents = pd.read_excel(file_name, header=None)
print 'cut words...'
cw = lambda x: [pair for pair in psg.lcut(x) if pair.word not in stopwords]
contents['pairs'] = contents[col_content].apply(cw)
matrix = reviews2matrix(list(contents['pairs']), posdict, negdict, inverse, adv)
x = matrix2vec(matrix)
y = list(contents[col_tag])
return x, y
开发者ID:wac81,项目名称:Book_DeepLearning_Practice,代码行数:18,
示例2: delNOTNeedWords
点赞 3
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def delNOTNeedWords(content, customstopwords=None):
    """POS-tag *content* and keep only words with content-bearing flags.

    :param content: text to segment with jieba.posseg.
    :param customstopwords: either a list of stopwords or a path to a
        stopword file (one word per line); defaults to "stopwords.txt".
        NOTE(review): if the path does not exist, the string itself is used
        for the ``in`` test below (substring membership) -- preserved as-is,
        confirm whether that is intended.
    :return: (result, return_words) -- concatenated UTF-8 bytes of the kept
        words, and the same words as a list (Python 2 byte strings).
    """
    if customstopwords is None:
        customstopwords = "stopwords.txt"
    import os
    if os.path.exists(customstopwords):
        f = codecs.open(customstopwords, encoding='UTF-8')
        try:
            customstopwords = f.read().split(u'\n')
        finally:
            f.close()
    result = ''
    return_words = []
    words = pseg.lcut(content)
    # Keep nouns, verbs, adjectives, numerals, time/locality words, etc.
    keep_flags = [u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg',
                  u'f', u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl',
                  u'vg', u'a', u'an', u'ag', u'al', u'm', u'mq', u'o', u'x']
    for word, flag in words:
        tempword = word.encode('utf-8').strip(' ')
        if word not in customstopwords and len(tempword) > 0 and flag in keep_flags:
            result += tempword
            return_words.append(tempword)
    return result, return_words
开发者ID:wac81,项目名称:Book_DeepLearning_Practice,代码行数:27,
示例3: delNOTNeedWords
点赞 3
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def delNOTNeedWords(content, stopwords):
    """POS-tag *content* and concatenate the words not in *stopwords*.

    :param content: text to segment with jieba.posseg.
    :param stopwords: container of words to drop (membership-tested).
    :return: concatenated UTF-8 byte string (Python 2) of the kept words.
    """
    result=''
    words = pseg.lcut(content)
    text_list = []  # NOTE(review): never populated or returned -- dead variable
    for word, flag in words:
        # NOTE(review): jieba POS flags are plain tags such as u'x' or u'uj';
        # the "/x"-style entries below can never match, so this flag filter
        # is effectively a no-op -- confirm whether "/" prefixes were intended.
        if (word not in stopwords and flag not in ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]):
            result += word.encode('utf-8')
    return result
# return ''.join(text_list)
开发者ID:wac81,项目名称:recommended_system,代码行数:20,代码来源:ar.py
示例4: jieba_example
点赞 2
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def jieba_example():
    """Quick tour of the main jieba APIs on one sample sentence.

    (The sample string was mojibake-damaged upstream; kept verbatim.)
    """
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)            # lazy generator of tokens
    raw_seq_list = jieba.lcut(raw)      # same tokens, materialized as a list
    # TF-IDF top-3 keywords, no POS restriction, no weights returned.
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)    # tokens paired with POS flags
    for word, flag in raw_with_ictclas:
        print word, flag
开发者ID:roliygu,项目名称:CNKICrawler,代码行数:10,
示例5: cut_with_flag
点赞 2
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """Segment *raw_str* into (word, flag) tuples via jieba.posseg.

    :param raw_str: str -- text to segment
    :param filter_invalid_word_flag: when True (default), pass the pairs
        through filter_invalid_word before returning
    :return: list[(str, str)]
    """
    pairs = [(word, flag) for word, flag in pseg.lcut(raw_str)]
    if not filter_invalid_word_flag:
        return pairs
    return filter_invalid_word(pairs)
开发者ID:roliygu,项目名称:CNKICrawler,代码行数:14,
示例6: maxSimTxt
点赞 2
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
    """Return the answer of the knowledge-base entry most similar to *intxt*.

    :param intxt: user input text; also appended to self.lastTxt history.
    :param simCondision: minimum similarity required to accept a match.
    :param simType: one of 'simple', 'simple_pos', 'vec'.
    :return: the matched entry's answer (``.a``), or an error / fallback
        string (original mojibake strings preserved verbatim).
    """
    self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: maxSimTxt?simType?????: {}'.format(simType)
    # Fall back to POS-based similarity when no embedding model is loaded.
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'
    for t in self.zhishiku:
        # 'vec' compares against precomputed question vectors; other modes
        # compare against the tokenized question words.
        questions = t.q_vec if simType == 'vec' else t.q_word
        # 'simple' uses plain tokens; other modes need (word, flag) pairs.
        in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)
        # Score each entry by its best-matching question.
        t.sim = max(
            similarity(in_vec, question, method=simType, embedding=embedding)
            for question in questions
        )
    maxSim = max(self.zhishiku, key=lambda x: x.sim)
    logger.info('maxSim=' + format(maxSim.sim, '.0%'))
    if maxSim.sim < simCondision:
        return '?????????????????????????'
    return maxSim.a
开发者ID:ofooo,项目名称:FAQrobot,代码行数:31,
示例7: __init__
点赞 2
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def __init__(self, rtepair, stop=True, lemmatize=False):
    """Extract word-overlap features from a text/hypothesis pair.

    :param rtepair: a ``RTEPair`` from which features should be extracted, (txt, hyp)
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    :param lemmatize: if ``True``, a lemmatization step would run; the
        branch below is currently a no-op (``pass``).
    """
    global stop_word_path
    self.stop = stop
    # Stopwords kept as one big string; membership below is substring-based.
    self.stopwords = codecs.open(stop_word_path + 'stopwords.txt', encoding='UTF-8').read()
    # Negation cue words (mojibake-damaged upstream; kept verbatim).
    self.negwords = set([u"?", u"??", u"??", u"?", u"??", u"??", u"??", u"??", u"??"])
    text_words = pseg.lcut(rtepair[0])
    hyp_words = pseg.lcut(rtepair[1])
    self.text_words = set()
    self.hyp_words = set()
    # Placeholder step (original comment was mojibake).
    pass
    # Optional lemmatization -- not implemented.
    if lemmatize:
        pass
    # Keep (word, flag) pairs whose word is not a stopword.
    for word, flag in text_words:
        if word not in self.stopwords:
            self.text_words.add((word, flag))
    for word, flag in hyp_words:
        if word not in self.stopwords:
            self.hyp_words.add((word, flag))
    # Set-algebra features over the two bags of tagged words.
    self._overlap = self.hyp_words & self.text_words   # shared by hyp and text
    self._hyp_extra = self.hyp_words - self.text_words # only in hyp
    self._txt_extra = self.text_words - self.hyp_words # only in text
开发者ID:wac81,项目名称:Book_DeepLearning_Practice,代码行数:38,
示例8: delstopwords
点赞 2
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def delstopwords(content):
    """Strip whitespace from *content*, POS-tag it, and concatenate every
    word that is neither in the module-level ``stopwords`` nor carries one
    of the listed flags. Returns a UTF-8 byte string (Python 2)."""
    skip_flags = ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]
    compact = "".join(content.split())
    kept = []
    for word, flag in pseg.lcut(compact):
        if word in stopwords or flag in skip_flags:
            continue
        kept.append(word.encode('utf-8'))
    return ''.join(kept)
开发者ID:wac81,项目名称:recommended_system,代码行数:10,
示例9: prefix_process
点赞 2
# 需要导入模块: from jieba import posseg [as 别名]
# 或者: from jieba.posseg import lcut [as 别名]
def prefix_process(curr_index, sentence, score):
    """Adjust a sentiment *score* by the words just before *curr_index*.

    Inspects (up to) the five characters preceding curr_index in *sentence*:
    double-negation prefix dampens (0.8x); a plain negation prefix flips or
    boosts (+/-1.3x) depending on what the negation attaches to; a degree
    adverb boosts (1.3x); otherwise the score is returned unchanged.

    :param curr_index: index of the current word within *sentence*
    :param sentence: the full sentence text
    :param score: base sentiment score of the current word
    :return: adjusted score (float)
    """
    num_cnt = 5
    if curr_index - num_cnt > 0:
        seg = sentence[curr_index - num_cnt:curr_index]
    else:
        seg = sentence[0:curr_index]
    # Double negation weakens rather than flips the polarity.
    for curr_neg_prefix in double_none_prefix:
        if seg.endswith(curr_neg_prefix):
            return 0.8 * score
    # Plain negation prefix: scan POS tags backwards to decide whether the
    # negation modifies this word or an earlier pronoun/noun/numeral.
    for curr_neg_prefix in set_neg_prefix:
        if seg.endswith(curr_neg_prefix):
            temp_pair = pseg.lcut(sentence[0:curr_index])
            for i, (w, f) in enumerate(reversed(temp_pair)):
                if f.startswith(u"x"):   # punctuation: stop scanning
                    break
                elif f.startswith(u"r") or f.startswith(u"n") or f.startswith(u"m"):
                    # Negation-of-negation before the nominal word -> boost.
                    if (len(temp_pair)-i-2) > 0 and temp_pair[len(temp_pair)-i-2].word in set_neg_prefix:
                        return 1.3 * score
                    return -1.3 * score
    temp_pair = pseg.lcut(seg)
    for i, (w, f) in enumerate(reversed(temp_pair)):
        if f.startswith(u"x"):
            break
        elif f.startswith(u"r") or f.startswith(u"n") or f.startswith(u"m"):
            # Bounds guard added (missing in the original): without it a
            # negative index silently wraps to the END of temp_pair and
            # tests the wrong word.
            if (len(temp_pair)-i-2) >= 0 and temp_pair[len(temp_pair)-i-2].word in set_neg_prefix:
                return -0.6 * score
    # A degree adverb directly in front amplifies the score.
    for curr_very_prefix in set_very_prefix:
        if seg.endswith(curr_very_prefix):
            return 1.3 * score
    return score
开发者ID:wac81,项目名称:Book_DeepLearning_Practice,代码行数:46,
注:本文中的jieba.posseg.lcut方法示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。