80s網(wǎng)站建設(shè)工作室鄭州網(wǎng)站建設(shè)七彩科技
KNN-手写数字数据集:
使用sklearn中的KNN算法工具包(KNeighborsClassifier)替换实现分类器的构建,注意使用的是汉明距离;
运行结果:(大概要运行4分钟左右)
代码:
import pandas as pd
import os
from collections import Counter


def hamming(str1: str, str2: str) -> int:
    """Return the Hamming distance between two equal-length strings.

    Args:
        str1: First string.
        str2: Second string; must have the same length as ``str1``.

    Returns:
        The number of positions at which the two strings differ.

    Raises:
        ValueError: If the two strings differ in length.
    """
    if len(str1) != len(str2):
        raise ValueError("兩個(gè)字符串長(zhǎng)度不相等")
    return sum(c1 != c2 for c1, c2 in zip(str1, str2))


def _load_digits(path: str) -> pd.DataFrame:
    """Read every file in *path* into a frame with ``img``/``labels`` columns."""
    # Entries prefixed with "._" (presumably macOS AppleDouble companions of
    # the real files) are mapped back to the plain file name.  Collecting the
    # names in a set keeps a sample from being loaded twice when both the
    # companion and the real file are listed; sorting makes the row order
    # independent of the OS-specific os.listdir() order.
    names = sorted({
        name[2:] if name.startswith('._') else name
        for name in os.listdir(path)
    })
    images = []
    labels = []
    for name in names:
        with open(os.path.join(path, name), 'r') as f:
            # Flatten the multi-line file into one string.
            images.append(f.read().replace('\n', ''))
        # The label is the part of the file name before the first underscore.
        labels.append(name.split('_')[0])
    return pd.DataFrame({'img': images, 'labels': labels})


def get_train(path: str = 'digits/trainingDigits') -> pd.DataFrame:
    """Load the training samples.

    Args:
        path: Directory holding the training files.

    Returns:
        A DataFrame with one row per file: ``img`` (file content with
        newlines removed) and ``labels`` (file-name prefix before ``_``).
    """
    return _load_digits(path)


def get_test(path: str = 'digits/testDigits') -> pd.DataFrame:
    """Load the test samples.

    Args:
        path: Directory holding the test files.

    Returns:
        A DataFrame with one row per file: ``img`` (file content with
        newlines removed) and ``labels`` (file-name prefix before ``_``).
    """
    return _load_digits(path)


def handwritingClass(train: pd.DataFrame, test: pd.DataFrame, k: int) -> pd.DataFrame:
    """Classify every test sample by a k-nearest-neighbour vote.

    Neighbours are ranked by Hamming distance between the image strings.
    Prints the resulting accuracy.

    Args:
        train: Frame whose column 0 holds image strings and column 1 labels.
        test: Frame with the same layout; a ``predict`` column is added to
            it in place.
        k: Number of nearest training samples that vote.

    Returns:
        ``test`` itself, now carrying the ``predict`` column.

    Raises:
        ValueError: If ``k`` is smaller than 1, if there are test samples but
            no training samples, or if image strings differ in length.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Pull the columns out once: per-cell .iloc access inside the double loop
    # is far slower than indexing plain lists.
    train_imgs = train.iloc[:, 0].tolist()
    train_labels = train.iloc[:, 1].tolist()
    test_imgs = test.iloc[:, 0].tolist()
    if test_imgs and not train_imgs:
        raise ValueError("training set is empty")

    result = []
    for img in test_imgs:
        # Distances stay integers: ranking them as strings would order
        # "10" before "9" and pick the wrong neighbours.
        dist = [hamming(sample, img) for sample in train_imgs]
        # sorted() is stable, so equally distant samples keep training order.
        nearest = sorted(range(len(dist)), key=dist.__getitem__)[:k]
        votes = Counter(train_labels[j] for j in nearest)
        # On a tied vote most_common() returns the label seen first, i.e.
        # the one belonging to the closest neighbour.
        result.append(votes.most_common(1)[0][0])

    # Assign a plain list so the predictions are positional and are not
    # re-aligned against the index of ``test``.
    test['predict'] = result
    acc = (test['predict'] == test.iloc[:, 1]).mean()
    print(f'模型預(yù)測(cè)準(zhǔn)確率為{acc:.5f}')
    return test


# Load the training and test sets.
train, test = get_train(), get_test()
# Classify every test sample using a vote among its 3 nearest neighbours.
handwritingClass(train, test, 3)