我们给大家带来了关于学习 Python 中 scikit-learn 机器学习库的具体实例,以下就是全部代码内容:
# -*- coding: utf-8 -*-
"""scikit-learn classification examples.

Example 1 trains and evaluates classifiers on the iris dataset.
Example 2 classifies custom text documents read from label:word-list files.

Modernized from the original Python 2 article code:
- Python 3 ``print()`` calls (Python 2 print statements do not parse).
- ``sklearn.cross_validation`` was removed in scikit-learn 0.20; the same
  functions live in ``sklearn.model_selection``.
- ``HashingVectorizer(non_negative=True)`` was removed; the replacement
  keyword is ``alternate_sign=False``.
- Multiclass ``precision/recall/f1`` now require an explicit ``average=``;
  ``'weighted'`` matches the old implicit default.
- File handles are closed via ``with`` (the original leaked them).
"""
import codecs

import numpy
from sklearn import linear_model, metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import (
    CountVectorizer,
    HashingVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
# sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection provides the same train/test split and CV helpers.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC


# ===================== Example 1: iris dataset =====================

def load_data():
    """Load iris and return an 80/20 train/test split.

    Returns:
        (x_train, y_train, x_test, y_test) — note the y-before-x order,
        which callers below rely on.
    """
    iris = load_iris()
    x, y = iris.data, iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=42)
    return x_train, y_train, x_test, y_test


def train_clf3(train_data, train_tags):
    """Train a linear SVM (very large C => almost no regularization)."""
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf


def train_clf(train_data, train_tags):
    """Train a multinomial naive Bayes classifier (prints the tag array)."""
    clf = MultinomialNB(alpha=0.01)
    print(numpy.asarray(train_tags))
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def evaluate(actual, pred):
    """Print precision, recall and F1 for the predictions.

    ``average='weighted'`` reproduces the old scikit-learn multiclass
    default; modern versions raise unless it is given explicitly.
    """
    m_precision = metrics.precision_score(actual, pred, average='weighted')
    m_recall = metrics.recall_score(actual, pred, average='weighted')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(
        metrics.f1_score(actual, pred, average='weighted')))


# The first example runs at import time, exactly as in the original script.
x_train, y_train, x_test, y_test = load_data()
clf = train_clf(x_train, y_train)
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print(metrics.classification_report(y_test, pred))


# =================== Example 2: custom text data ===================
# (The article's second, standalone script.  Later same-named
# definitions intentionally shadow the ones above, as they did when
# the two scripts were concatenated.)

train_corpus = [
    '我们 我们 好孩子 认证 。 就是',
    '我们 好孩子 认证 。 中国',
    '我们 好孩子 认证 。 孤独',
    '我们 好孩子 认证 。',
]
test_corpus = [
    '我 菲律宾 韩国',
    '我们 好孩子 认证 。 中国',
]


def input_data(train_file, test_file):
    """Read 'label:word-list' lines from the train and test files.

    Each line looks like ``tag:[w1, w2, ...]\\n``; the slice strips the
    surrounding bracket characters and trailing newline before splitting
    on ", " — TODO confirm against the actual file format.

    Returns:
        (train_words, train_tags, test_words, test_tags)
    """
    train_words, train_tags = [], []
    test_words, test_tags = [], []
    # 'with' guarantees the handles are closed (the original leaked them).
    with codecs.open(train_file, 'r', 'utf-8', 'ignore') as f1:
        for line in f1:
            tks = line.split(':', 1)
            word_list = tks[1]
            word_array = word_list[1:(len(word_list) - 3)].split(", ")
            train_words.append(" ".join(word_array))
            train_tags.append(tks[0])
    with codecs.open(test_file, 'r', 'utf-8', 'ignore') as f2:
        for line in f2:
            tks = line.split(':', 1)
            word_list = tks[1]
            word_array = word_list[1:(len(word_list) - 3)].split(", ")
            test_words.append(" ".join(word_array))
            test_tags.append(tks[0])
    return train_words, train_tags, test_words, test_tags


def vectorize(train_words, test_words):
    """Hash-based vectorizer.

    ``non_negative=True`` was removed from scikit-learn; the equivalent
    modern keyword is ``alternate_sign=False`` (all-positive features,
    as MultinomialNB requires).
    """
    v = HashingVectorizer(alternate_sign=False)
    train_data = v.fit_transform(train_words)
    test_data = v.fit_transform(test_words)
    return train_data, test_data


def vectorize1(train_words, test_words):
    """TF-IDF vectorizer; the test vectorizer reuses the train vocabulary."""
    tv = TfidfVectorizer(sublinear_tf=False, use_idf=True)
    train_data = tv.fit_transform(train_words)
    tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
    test_data = tv2.fit_transform(test_words)
    return train_data, test_data


def vectorize2(train_words, test_words):
    """Raw counts + TF-IDF transform; test counts use the train vocabulary."""
    count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
    counts_train = count_v1.fit_transform(train_words)
    count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)
    counts_test = count_v2.fit_transform(test_words)
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit(counts_train).transform(counts_train)
    test_data = tfidftransformer.fit(counts_test).transform(counts_test)
    return train_data, test_data


def evaluate(actual, pred):
    """Print precision, recall and F1 (weighted average; see example 1)."""
    m_precision = metrics.precision_score(actual, pred, average='weighted')
    m_recall = metrics.recall_score(actual, pred, average='weighted')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(
        metrics.f1_score(actual, pred, average='weighted')))


def train_clf(train_data, train_tags):
    """Multinomial naive Bayes classifier."""
    clf = MultinomialNB(alpha=0.01)
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def train_clf1(train_data, train_tags):
    """KNN classifier (scikit-learn default, k=5)."""
    clf = KNeighborsClassifier()
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def train_clf2(train_data, train_tags):
    """Logistic regression with very weak regularization (C=1e5)."""
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_tags)
    return clf


def train_clf3(train_data, train_tags):
    """Linear SVM (very large C => almost no regularization)."""
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf


def train_clf4(train_data, train_tags):
    """Random forest — cannot consume a sparse matrix, hence todense()."""
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(train_data.todense(), train_tags)
    return clf


def codecs_read_label_line(filename):
    """Read a file line by line, stripping the trailing newline character."""
    label_list = []
    with codecs.open(filename, 'r', 'utf-8', 'ignore') as f:
        for line in f:
            label_list.append(line[0:len(line) - 1])
    return label_list


def save_test_features(test_url, test_label):
    """Write 'feature<TAB>label' lines to test_labeded.dat.

    NOTE(review): ``test_url`` is ignored — the feature list is re-read
    from 'test.dat', as in the original code.
    """
    test_feature_list = codecs_read_label_line('test.dat')
    with open('test_labeded.dat', "w+") as fw:
        for (url, label) in zip(test_feature_list, test_label):
            fw.write(url + '\t' + label)
            fw.write('\n')


def main():
    """Train a linear SVM on the custom text data and report errors."""
    train_file = u'..\\file\\py_train.txt'
    test_file = u'..\\file\\py_test.txt'
    train_words, train_tags, test_words, test_tags = input_data(
        train_file, test_file)
    train_data, test_data = vectorize1(train_words, test_words)
    print(type(train_data))
    print(train_data.shape)
    print(test_data.shape)
    print(test_data[0].shape)
    print(numpy.asarray(test_data[0]))
    clf = train_clf3(train_data, train_tags)
    scores = cross_val_score(
        clf, train_data, train_tags, cv=5, scoring="f1_weighted")
    print(scores)
    pred = clf.predict(test_data)
    error_list = []
    for (true_tag, predict_tag) in zip(test_tags, pred):
        if true_tag != predict_tag:
            print(true_tag, predict_tag)
            error_list.append(true_tag + ' ' + predict_tag)
    print(len(error_list))
    evaluate(numpy.asarray(test_tags), pred)


if __name__ == '__main__':
    main()
广告合作:本站广告合作请联系QQ:858582 申请时备注:广告合作(否则不回)
免责声明:本站资源来自互联网收集,仅供用于学习和交流,请遵循相关法律法规,本站一切资源不代表本站立场,如有侵权、后门、不妥请联系本站删除!
免责声明:本站资源来自互联网收集,仅供用于学习和交流,请遵循相关法律法规,本站一切资源不代表本站立场,如有侵权、后门、不妥请联系本站删除!
暂无评论...
稳了!魔兽国服回归的3条重磅消息!官宣时间再确认!
昨天有一位朋友在大神群里分享,自己亚服账号被封号之后居然弹出了国服的封号信息对话框。
这里面让他访问的是一个国服的战网网址,com.cn和后面的zh都非常明白地表明这就是国服战网。
而他在复制这个网址并且进行登录之后,确实是网易的网址,也就是我们熟悉的停服之后国服发布的暴雪游戏产品运营到期开放退款的说明。这是一件比较奇怪的事情,因为以前都没有出现这样的情况,现在突然提示跳转到国服战网的网址,是不是说明了简体中文客户端已经开始进行更新了呢?
更新日志
2024年11月26日
2024年11月26日
- 凤飞飞《我们的主题曲》飞跃制作[正版原抓WAV+CUE]
- 刘嘉亮《亮情歌2》[WAV+CUE][1G]
- 红馆40·谭咏麟《歌者恋歌浓情30年演唱会》3CD[低速原抓WAV+CUE][1.8G]
- 刘纬武《睡眠宝宝竖琴童谣 吉卜力工作室 白噪音安抚》[320K/MP3][193.25MB]
- 【轻音乐】曼托凡尼乐团《精选辑》2CD.1998[FLAC+CUE整轨]
- 邝美云《心中有爱》1989年香港DMIJP版1MTO东芝首版[WAV+CUE]
- 群星《情叹-发烧女声DSD》天籁女声发烧碟[WAV+CUE]
- 刘纬武《睡眠宝宝竖琴童谣 吉卜力工作室 白噪音安抚》[FLAC/分轨][748.03MB]
- 理想混蛋《Origin Sessions》[320K/MP3][37.47MB]
- 公馆青少年《我其实一点都不酷》[320K/MP3][78.78MB]
- 群星《情叹-发烧男声DSD》最值得珍藏的完美男声[WAV+CUE]
- 群星《国韵飘香·贵妃醉酒HQCD黑胶王》2CD[WAV]
- 卫兰《DAUGHTER》【低速原抓WAV+CUE】
- 公馆青少年《我其实一点都不酷》[FLAC/分轨][398.22MB]
- ZWEI《迟暮的花 (Explicit)》[320K/MP3][57.16MB]