Python 中 scikit-learn 机器学习代码实例
Python  /  管理员 发布于 6年前   115
本文给大家带来了学习 Python 中 scikit-learn 机器学习代码的两个具体实例(使用内置 iris 数据集,以及使用自定义文本数据),以下就是全部代码内容:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Example 1: train and evaluate a classifier on the bundled iris dataset.
# ---------------------------------------------------------------------------
import numpy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import preprocessing

try:
    # Newer scikit-learn releases ship these helpers in ``model_selection``;
    # the alias keeps the ``cross_validation`` name used further down.
    from sklearn import model_selection as cross_validation
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only have the original ``cross_validation`` module.
    from sklearn import cross_validation
    from sklearn.cross_validation import train_test_split
# import iris_data


def load_data():
    """Load the iris dataset and split it 80/20 into train and test parts.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``. The split is
        reproducible because ``random_state`` is fixed to 42.
    """
    iris = load_iris()
    x, y = iris.data, iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=42)
    return x_train, y_train, x_test, y_test


def train_clf3(train_data, train_tags):
    """Fit and return a ``LinearSVC`` (linear kernel) with ``C=1100.0``."""
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf


def train_clf(train_data, train_tags):
    """Fit and return a ``MultinomialNB`` with ``alpha=0.01``.

    The training labels are printed before fitting.
    """
    clf = MultinomialNB(alpha=0.01)
    labels = numpy.asarray(train_tags)
    print(labels)
    clf.fit(train_data, labels)
    return clf


def evaluate(actual, pred, average='weighted'):
    """Print precision, recall and F1 for the given predictions.

    Args:
        actual: True labels.
        pred: Predicted labels, aligned with ``actual``.
        average: Averaging mode forwarded to the scikit-learn metric
            functions. Defaults to ``'weighted'`` so that multiclass
            targets (iris has three classes) are accepted; the library's
            own default only handles binary targets.
    """
    m_precision = metrics.precision_score(actual, pred, average=average)
    m_recall = metrics.recall_score(actual, pred, average=average)
    m_f1 = metrics.f1_score(actual, pred, average=average)
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))


x_train, y_train, x_test, y_test = load_data()
clf = train_clf(x_train, y_train)
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print(metrics.classification_report(y_test, pred))


# ---------------------------------------------------------------------------
# Example 2: using custom data (text classification from tagged files).
# ---------------------------------------------------------------------------
# coding: utf-8
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

try:
    # Same fallback as in example 1, repeated so this listing stands alone.
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

# Tiny whitespace-tokenised sample corpora (not used by ``main``).
train_corpus = [
    '我们 我们 好孩子 认证 。 就是',
    '我们 好孩子 认证 。 中国',
    '我们 好孩子 认证 。 孤独',
    '我们 好孩子 认证 。',
]

test_corpus = [
    '我 菲律宾 韩国',
    '我们 好孩子 认证 。 中国',
]


def _read_tagged_file(path):
    """Parse one ``tag:words`` file into parallel word/tag lists.

    Each line is split at the first ``:``. The first character and the last
    three characters of the remainder are dropped (presumably an opening
    bracket and a closing bracket plus line ending -- confirm against the
    data files), and the ``", "``-separated tokens are re-joined with spaces.
    Lines without a ``:`` (for example a trailing blank line) are skipped.
    """
    words = []
    tags = []
    with codecs.open(path, 'r', 'utf-8', 'ignore') as handle:
        for line in handle:
            tks = line.split(':', 1)
            if len(tks) < 2:
                continue
            word_list = tks[1]
            word_array = word_list[1:(len(word_list) - 3)].split(", ")
            words.append(" ".join(word_array))
            tags.append(tks[0])
    return words, tags


def input_data(train_file, test_file):
    """Read the training and test files.

    Args:
        train_file: Path of the tagged training file.
        test_file: Path of the tagged test file.

    Returns:
        Tuple ``(train_words, train_tags, test_words, test_tags)`` where the
        ``*_words`` entries are space-joined token strings.
    """
    train_words, train_tags = _read_tagged_file(train_file)
    test_words, test_tags = _read_tagged_file(test_file)
    return train_words, train_tags, test_words, test_tags


def vectorize(train_words, test_words):
    """Vectorize both corpora with a stateless ``HashingVectorizer``.

    Returns:
        Tuple ``(train_data, test_data)`` of sparse matrices.
    """
    # v = HashingVectorizer(n_features=25000, non_negative=True)
    # v = CountVectorizer(min_df=1)
    try:
        # NOTE(review): ``alternate_sign=False`` is the documented replacement
        # for the removed ``non_negative=True`` keyword; confirm against the
        # installed scikit-learn version.
        v = HashingVectorizer(alternate_sign=False)
    except TypeError:
        # Older releases only know the original keyword.
        v = HashingVectorizer(non_negative=True)
    train_data = v.fit_transform(train_words)
    # Hashing needs no fitting, so the test set is only transformed.
    test_data = v.transform(test_words)
    return train_data, test_data


def vectorize1(train_words, test_words):
    """Vectorize both corpora with TF-IDF fitted on the training set only.

    Returns:
        Tuple ``(train_data, test_data)`` of sparse matrices sharing the
        training vocabulary and the training IDF weights.
    """
    tv = TfidfVectorizer(sublinear_tf=False, use_idf=True)
    train_data = tv.fit_transform(train_words)
    # Reuse the fitted vectorizer: re-fitting on the test set would compute
    # IDF weights from test documents and make the two matrices inconsistent.
    test_data = tv.transform(test_words)
    return train_data, test_data


def vectorize2(train_words, test_words):
    """Vectorize with ``CountVectorizer`` followed by ``TfidfTransformer``.

    Both steps are fitted on the training set and then applied to the test
    set.

    Returns:
        Tuple ``(train_data, test_data)`` of sparse matrices.
    """
    count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
    counts_train = count_v1.fit_transform(train_words)
    counts_test = count_v1.transform(test_words)

    tfidftransformer = TfidfTransformer()
    tfidftransformer.fit(counts_train)
    train_data = tfidftransformer.transform(counts_train)
    # Apply the training IDF weights instead of re-fitting on the test counts.
    test_data = tfidftransformer.transform(counts_test)
    return train_data, test_data


def evaluate(actual, pred, average='weighted'):
    """Print precision, recall and F1 for the given predictions.

    Args:
        actual: True labels.
        pred: Predicted labels, aligned with ``actual``.
        average: Averaging mode forwarded to the scikit-learn metric
            functions. Defaults to ``'weighted'``, matching the
            ``"f1_weighted"`` scoring used in ``main``.
    """
    m_precision = metrics.precision_score(actual, pred, average=average)
    m_recall = metrics.recall_score(actual, pred, average=average)
    m_f1 = metrics.f1_score(actual, pred, average=average)
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))


def train_clf(train_data, train_tags):
    """Fit and return a ``MultinomialNB`` with ``alpha=0.01``."""
    clf = MultinomialNB(alpha=0.01)
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def train_clf1(train_data, train_tags):
    """Fit and return a k-nearest-neighbours classifier (library defaults)."""
    clf = KNeighborsClassifier()
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def train_clf2(train_data, train_tags):
    """Fit and return a ``LogisticRegression`` with ``C=1e5``."""
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_tags)
    return clf


def train_clf3(train_data, train_tags):
    """Fit and return a ``LinearSVC`` (linear kernel) with ``C=1100.0``."""
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf


def train_clf4(train_data, train_tags):
    """Fit and return a 10-tree random forest.

    The sparse input is densified first, as the original author notes the
    forest must not be given a sparse matrix.
    """
    clf = RandomForestClassifier(n_estimators=10)
    # ``toarray`` yields a plain ndarray; ``todense`` would return the legacy
    # ``numpy.matrix`` type.
    clf.fit(train_data.toarray(), train_tags)
    return clf


def codecs_read_label_line(filename):
    """Read a UTF-8 file line by line using ``codecs``.

    Returns:
        List with one entry per line, with the trailing line ending removed.
    """
    with codecs.open(filename, 'r', 'utf-8', 'ignore') as f:
        # Strip only actual line-ending characters, so a final line without a
        # newline keeps its last character.
        return [line.rstrip('\r\n') for line in f]


def save_test_features(test_url, test_label):
    """Write ``feature<TAB>label`` lines to ``test_labeded.dat``.

    Args:
        test_url: Currently unused; the features are always re-read from
            ``test.dat``, as in the original code.
        test_label: Labels, paired in order with the lines of ``test.dat``.
    """
    test_feature_list = codecs_read_label_line('test.dat')
    # Open through codecs so non-ASCII feature text is encoded as UTF-8.
    with codecs.open('test_labeded.dat', 'w', 'utf-8') as fw:
        for url, label in zip(test_feature_list, test_label):
            fw.write(url + '\t' + label)
            fw.write('\n')


def main(train_file=u'..\\file\\py_train.txt',
         test_file=u'..\\file\\py_test.txt'):
    """Train a ``LinearSVC`` on the tagged text files and report its quality.

    Args:
        train_file: Path of the tagged training file.
        test_file: Path of the tagged test file.

    Prints the matrix shapes, 5-fold ``f1_weighted`` cross-validation scores,
    every misclassified test sample, the error count and the final
    precision/recall/F1.
    """
    train_words, train_tags, test_words, test_tags = input_data(
        train_file, test_file)
    # print len(train_words), len(train_tags), len(test_words), len(test_words),

    train_data, test_data = vectorize1(train_words, test_words)
    print(type(train_data))
    print(train_data.shape)
    print(test_data.shape)
    print(test_data[0].shape)
    print(numpy.asarray(test_data[0]))

    clf = train_clf3(train_data, train_tags)

    scores = cross_validation.cross_val_score(
        clf, train_data, train_tags, cv=5, scoring="f1_weighted")
    print(scores)
    # predicted = cross_validation.cross_val_predict(
    #     clf, train_data, train_tags, cv=5)

    pred = clf.predict(test_data)
    error_list = []
    for true_tag, predict_tag in zip(test_tags, pred):
        if true_tag != predict_tag:
            mismatch = true_tag + ' ' + predict_tag
            print(mismatch)
            error_list.append(mismatch)
    print(len(error_list))

    evaluate(numpy.asarray(test_tags), pred)

    # Optional: write out the labelling result.
    # test_feature_list = codecs_read_label_line('test.dat')
    # save_test_features(test_feature_list, pred)


if __name__ == '__main__':
    main()
123 在
Clash for Windows作者删库跑路了,github已404中评论 按理说只要你在国内,所有的流量进出都在监控范围内,不管你怎么隐藏也没用,想搞你分..原梓番博客 在
在Laravel框架中使用模型Model分表最简单的方法中评论 好久好久都没看友情链接申请了,今天刚看,已经添加。..博主 在
佛跳墙vpn软件不会用?上不了网?佛跳墙vpn常见问题以及解决办法中评论 @1111老铁这个不行了,可以看看近期评论的其他文章..1111 在
佛跳墙vpn软件不会用?上不了网?佛跳墙vpn常见问题以及解决办法中评论 网站不能打开,博主百忙中能否发个APP下载链接,佛跳墙或极光..路人 在
php中使用hyperf框架调用讯飞星火大模型实现国内版chatgpt功能示例中评论 教程很详细,如果加个前端chatgpt对话页面就完美了..Copyright·© 2019 侯体宗版权所有· 粤ICP备20027696号