
python - NLTK pre-processing "stop words/URLs": something wrong in the code

Problem description:

I am doing sentiment analysis for Twitter, and the program is almost done, but I am facing some problems with the pre-processing. I am trying to filter the given documents, first by removing the stop words and then by removing the hashtags and URLs from each phrase. Can someone tell me whether the filtering of stop words and hashtags/URLs is correct, i.e. does it really remove and filter them from any given phrase? This is my whole program:

import nltk
import random
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

short_pos = open("short_reviews/positive.txt", "r").read()
short_neg = open("short_reviews/negative.txt", "r").read()

all_words = []
documents = []

# "J" keeps only adjectives (JJ, JJR, JJS) from the POS tags
allowed_word_types = ["J"]

for p in short_pos.split('\n'):
    documents.append((p, "pos"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

for p in short_neg.split('\n'):
    documents.append((p, "neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

stop_words = set(stopwords.words("english"))

filtered_sentence = []
for w in all_words:
    if w not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)

filtered_documents = open("pickled_algos/fil_documents.pickle", "wb")
pickle.dump(filtered_sentence, filtered_documents)
filtered_documents.close()

# remove the words that start with #
filter(lambda x: x[0] != '#', documents.split())

# remove URL
filter(lambda x: x[0] != 'https://www.', documents.split())

save_documents = open("pickled_algos/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:5000]

save_word_features = open("pickled_algos/word_features5k.pickle", "wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

save_featuresets = open("pickled_algos/featuresets.pickle", "wb")
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()

random.shuffle(featuresets)

testing_set = featuresets[10000:]
training_set = featuresets[:10000]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

def realsenti(text):
    feats = find_features(text)
    return voted_classifier.classify(feats), text
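(Note: the voted_classifier that realsenti uses is built from the VoteClassifier class above; that part is not shown here. A minimal sketch of how it could be constructed, where the two wrapped scikit-learn classifiers are illustrative assumptions and not code from my program:)

from nltk.classify.scikitlearn import SklearnClassifier

# hypothetical sketch: only `classifier` (the NLTK Naive Bayes) exists in
# the listing above; the two wrapped classifiers below are assumptions
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)

voted_classifier = VoteClassifier(classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier)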

And this is the part that I want to check, to see whether it is really doing the job correctly:

stop_words = set(stopwords.words("english"))

filtered_sentence = []
for w in all_words:
    if w not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)

filtered_documents = open("pickled_algos/fil_documents.pickle", "wb")
pickle.dump(filtered_sentence, filtered_documents)
filtered_documents.close()

# remove the words that start with #
filter(lambda x: x[0] != '#', documents.split())

# remove URL
filter(lambda x: x[0] != 'https://www.', documents.split())

since I am getting this error every time:

filter(lambda x: x[0] != '#', documents.split())
AttributeError: 'list' object has no attribute 'split'
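(For context on the traceback: documents here is a list of (text, label) tuples, and a Python list has no .split() method; in addition, filter() returns a new iterator rather than modifying documents in place, so even on a string its result would be discarded. A minimal sketch of applying the hashtag/URL filtering to each text instead, where clean_text is a hypothetical helper name added only for illustration:)

def clean_text(text):
    # drop tokens that are hashtags or look like URLs; keep the rest
    kept = [t for t in text.split()
            if not t.startswith('#')
            and not t.startswith(('http://', 'https://'))]
    return " ".join(kept)

# split each text, not the list itself, and keep the filtered result
documents = [(clean_text(text), label) for (text, label) in documents]

(With something along those lines, the two standalone filter(...) calls would no longer be needed.)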

I hope everything is clear, and I hope someone can point out what I did wrong. Thanks in advance.
