简易中文分词聚类(Python)

来源:转载


[var1]
# -*- coding: utf-8 -*-
__author__ = 'Zhao'
import bisect
import operator
import re
# Module-level drawing settings for tree(); kept as one-element lists so any
# existing reference to blank[0] / tabs[0] keeps working.
blank = [chr(183)]  # filler character (middle dot) used to pad the indentation
tabs = ['']  # prefix printed in front of every continuation line of the outermost tree


def tree(lst):
    """Print a nested list/tuple as a tree drawn with box-drawing characters.

    Items that are lists or tuples are drawn as sub-branches; every other
    item is printed with ``print(" ", item)``. An empty sequence prints a
    bare ``───``.

    The indentation prefix is passed down the recursion explicitly instead
    of being accumulated in the global ``tabs``, so an exception raised
    while printing cannot leave a stale prefix behind for the next call.

    Args:
        lst: the (possibly nested) list or tuple to draw.

    Returns:
        None. The drawing is written to standard output.
    """
    _print_tree(lst, tabs[0])


def _print_tree(lst, prefix):
    """Draw ``lst``, starting every line after the first with ``prefix``."""
    count = len(lst)
    if count == 0:
        print('─' * 3)
        return
    for i, item in enumerate(lst):
        is_last = i + 1 == count
        # The first item continues the line its parent has already started.
        if i != 0:
            print(prefix, end='')
        if count == 1:
            s = '─' * 3
        elif i == 0:
            s = '┬' + '─' * 2
        elif is_last:
            s = '└' + '─' * 2
        else:
            s = '├' + '─' * 2
        print(s, end='')
        if isinstance(item, (list, tuple)):
            # Below the last branch nothing continues, so pad with filler
            # only; otherwise keep the vertical rule of this level going.
            if is_last:
                child_prefix = prefix + blank[0] * 3
            else:
                child_prefix = prefix + '│' + blank[0] * 2
            _print_tree(item, child_prefix)
        else:
            print(" ", item)
def judge_element_delete(list_input, centroid, group, match_num):
    """Delete from ``centroid`` and ``group`` the entries matching ``match_num``.

    An element of ``list_input`` matches when it equals ``match_num`` or,
    if it is a list, when any of its members equals ``match_num``. For
    every matching position the entry at that same position is removed
    from both ``centroid`` and ``group``.

    All matching positions are collected before anything is deleted, and
    deletions run from the highest position down. This keeps positions
    valid when several elements match, removes a position only once even
    if a nested list contains ``match_num`` several times, and is safe
    when ``list_input`` is the same object as ``group`` or ``centroid``.

    Args:
        list_input: list that is scanned for ``match_num`` (not modified
            unless it is the same object as ``centroid`` or ``group``).
        centroid: list modified in place.
        group: list modified in place.
        match_num: value to look for.

    Returns:
        None.
    """
    positions = []
    for position, list_element in enumerate(list_input):
        if isinstance(list_element, list):
            hit = any(element == match_num for element in list_element)
        else:
            hit = list_element == match_num
        if hit:
            positions.append(position)
    # Highest position first, so the remaining positions do not shift.
    for position in reversed(positions):
        del centroid[position]
        del group[position]
# --------------- load the word list from disk into a sorted Python list ---------------
path = '/Users/apple/desktop/'
# `with` closes the file even if reading fails.
# NOTE(review): assumes list.txt is UTF-8 encoded - confirm against the real file.
with open(path + 'list.txt', encoding='utf-8') as fp:
    # ori holds the raw lines exactly as read, line terminators included
    ori = fp.readlines()
# copy holds the same entries without their trailing newline. The previous
# pattern r'/n' matched a slash followed by "n", so the newline stayed and no
# entry could ever equal a slice of the input text.
copy = [x.rstrip('\n') for x in ori]
# The file is closed now; everything below works on the in-memory list only.
# Sorted so that the lookup further down can use binary search.
copy.sort()
# --------------- end of loading ---------------
# The average word length in this list is 2, so the step is set to 5;
# a step of that size can contain at least one word.
# In total there are 56064 words in this list and only 56 of them are longer than 5,
# so 5 is a reasonable step for this program.
# sum = 0
# num = 0
# for x in copy:
# sum += len(x)
# num += 1
# average = (int)(sum/num)
# print(average, ' ', num);
# max_lenth = 0
# for x in copy:
# if max_lenth < len(x):
#max_lenth = len(x)
#
# print(max_lenth)
# number = 0
# for x in copy:
# if len(x) > 5:
#number += 1
#
# print(number)
# --------------- the commented-out code above is the preparatory calculation ---------------
# Ask for the paragraph to segment. The prompt used to end in a literal "/n";
# a newline escape was intended.
str_input = input("请输入一个段落:\n")
# Remove punctuation and spaces, one mark after the other (same order as before).
# Plain str.replace is used instead of regular expressions: the previous
# patterns r'/.' and r'/?' did not match a literal period / question mark, and
# r'?' is not a valid pattern at all (re.error: nothing to repeat).
# NOTE(review): the ASCII patterns for , ! ? ; each appeared twice, which
# suggests the second of each pair was originally the fullwidth mark; both
# variants are removed here - confirm.
for mark in (',', ',', '.', '。', '——', '……', '!', '!', '?', '?', ';', ';', ' '):
    str_input = str_input.replace(mark, '')
# Dropping the punctuation joins neighbouring clauses, so a word may be matched
# across what used to be a boundary. The fixed step can split in the wrong
# place as well, so this is deliberately left as it is.
# --------------- forward maximum matching against the sorted word list ---------------
# result   : the tokens found, in reading order
# ch_index : for each token its position in `copy`, or -1 when the token is a
#            single character that is not in the word list
str_tail = len(str_input)
step = 5  # longest slice, in characters, that is looked up
temp = 0  # position in str_input where the next token starts
result = []
ch_index = []
# `temp < str_tail` (not `str_tail - 1`) so the final character is not dropped.
while temp < str_tail:
    # Try the longest slice first and shrink it one character at a time.
    for ptr in range(min(step, str_tail - temp), 0, -1):
        in_put = str_input[temp:temp + ptr]
        # Binary search in the sorted list. Unlike the previous hand-written
        # search, this can also find the entry at position 0.
        half = bisect.bisect_left(copy, in_put)
        if half < len(copy) and copy[half] == in_put:
            result.append(in_put)
            ch_index.append(half)
            temp += ptr
            break
    else:
        # No slice starting here is in the list: emit the single character.
        result.append(str_input[temp])
        ch_index.append(-1)
        temp += 1
# --------------- merge the tokens step by step until two clusters remain ---------------
# group    : the current clusters; an entry is a token string or a list of
#            previously merged entries (same list object as `result`)
# centroid : one number per cluster; starts as the word-list positions in
#            `ch_index`, a merged cluster gets the mean of its members
group = result
centroid = ch_index
# Similarities are rounded to as many decimals as the longest token has characters.
precision = max((len(element) for element in group), default=0)
group_num = len(group)
# `> 2` instead of `!= 2`: with fewer than two tokens the old condition never
# became false and the loop body crashed on the empty / 1x1 matrix.
while group_num > 2:
    # Pairwise distance between the (integer-truncated) centroids.
    matrix = [[abs(int(a) - int(b)) for b in centroid] for a in centroid]
    max_in_matrix = max(max(row) for row in matrix)
    if max_in_matrix == 0:
        # Every centroid is the same, so the distances cannot be normalised
        # (this used to raise ZeroDivisionError). Nothing left to tell apart.
        break
    # Turn distances into similarities in [0, 1]. Only the diagonal is
    # zeroed, so a cluster is never paired with itself; the old test
    # "similarity == 1 -> 0" also discarded distinct clusters whose centroids
    # coincide, although those are the most similar ones.
    for i in range(group_num):
        for j in range(group_num):
            if i == j:
                matrix[i][j] = 0
            else:
                matrix[i][j] = round(1 - matrix[i][j] / max_in_matrix, precision)
    similarity = max(max(row) for row in matrix)
    # First pair, in row-major order, that reaches the highest similarity.
    first, second = next(
        (i, j)
        for i in range(group_num)
        for j in range(group_num)
        if i != j and matrix[i][j] == similarity
    )
    # Add every other cluster that is equally similar to either member of
    # that pair. `not in index` keeps a cluster from being merged twice.
    index = [first, second]
    for anchor in (first, second):
        for i in range(group_num):
            if matrix[anchor][i] == similarity and i not in index:
                index.append(i)
    temp_class = [group[i] for i in index]
    new_centroid = sum(centroid[i] for i in index) / len(index)
    # Delete by position, highest first, so the remaining positions stay valid.
    # This replaces the 'substitute' marker, which would also have removed a
    # real token with that text. The lists are changed in place.
    for i in sorted(index, reverse=True):
        del group[i]
        del centroid[i]
    group.append(temp_class)
    centroid.append(new_centroid)
    group_num = len(group)
print(group)
tree(group)

分享给朋友:
您可能感兴趣的文章:
随机阅读: