摘要:由于近期學(xué)業(yè)繁重,所以我就不說廢話了,直接上代碼。本文給出一個簡單的決策樹示例及其運行效果,內(nèi)容包括:定義文本框和箭頭格式、使用文本注解繪制樹節(jié)點、繪制帶箭頭的注解、在父子節(jié)點間填充文本信息、創(chuàng)建數(shù)據(jù)集,以及計算給定數(shù)據(jù)的香農(nóng)熵(熵值越高,混合的數(shù)據(jù)越多,越無序,我們可以在數(shù)據(jù)集中添加更多的分類來驗證)。
由于近期學(xué)業(yè)繁重QAQ,所以我就不說廢話了,直接上代碼~
# Simple decision tree (ID3) example, after "Machine Learning in Action".
from math import log
import operator

# Text-box and arrow styles used when drawing the tree.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one annotated node with an arrow pointing from parentPt to centerPt."""
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords="axes fraction",
                            xytext=centerPt, textcoords="axes fraction",
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)


def plotMidText(cntrPt, parentPt, txtString):
    """Write txtString at the midpoint of the edge between parent and child."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively plot a nested-dict decision tree below parentPt.

    Uses function attributes (xOff, yOff, totalW, totalD) set by createPlot
    as the drawing cursor and scale.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            # A dict child is another decision node: recurse one level down.
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # A non-dict child is a leaf: advance the x cursor and draw it.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlot(inTree):
    """Create a figure and draw the whole decision tree inTree."""
    # Imported lazily so the numeric part of this module works without matplotlib.
    import matplotlib.pyplot as plt
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    # BUG FIX: totalD must be the tree DEPTH; the original reused getNumLeafs,
    # which distorts the vertical spacing of the plot.
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), "")
    plt.show()


def createDataSet():
    """Return the toy fish dataset and the names of its two features."""
    dataSet = [[1, 1, "yes"],
               [1, 1, "yes"],
               [1, 0, "no"],
               [0, 1, "no"],
               [0, 1, "no"]]
    labels = ["no surfacing", "flippers"]
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels (last column of each row).

    The higher the entropy, the more mixed (disordered) the classes are.
    """
    numEntries = len(dataSet)
    # Count occurrences of each class label ("yes" or "no" in the toy data).
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # logarithm base 2
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that column removed.

    Parameters: dataSet - list of rows; axis - column index to filter on;
    value - the value that column must hold for a row to be kept.
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Keep everything before and after the filtered column.
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the column index whose split yields the largest information gain."""
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Distinct values found in column i.
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            # Weighted entropy after splitting on (column i == value).
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the class name that occurs most often in classList."""
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # BUG FIX: the original read `sorted(classCount,iteritems(), ...)`, which is
    # a NameError; `classCount.items()` is the correct Python 3 spelling.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    NOTE: `labels` is mutated (the chosen feature name is deleted); pass a
    copy if the caller still needs the full list afterwards.
    """
    classList = [example[-1] for example in dataSet]
    # Stop 1: every sample has the same class label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop 2: all features consumed but classes still mixed -> majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # Copy so every sibling branch sees the same remaining labels.
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def getNumLeafs(myTree):
    """Count the leaf nodes of a nested-dict decision tree."""
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            # A dict child is a decision node: recurse.
            numLeafs += getNumLeafs(secondDict[key])
        else:
            # A plain value is a leaf.
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    """Return the number of decision levels in a nested-dict decision tree."""
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def main():
    dataSet, labels = createDataSet()
    chooseBestFeatureToSplit(dataSet)
    # Expected tree:
    # {"no surfacing": {0: "no", 1: {"flippers": {0: "no", 1: "yes"}}}}
    myTree = createTree(dataSet, labels)
    print("myTree:")
    print(myTree)
    createPlot(myTree)


if __name__ == "__main__":
    main()
# Decision tree (ID3) with classification, pickle persistence, and a
# contact-lenses prediction demo, after "Machine Learning in Action".
from math import log
import operator

# Text-box and arrow styles used when drawing the tree.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one annotated node with an arrow pointing from parentPt to centerPt."""
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords="axes fraction",
                            xytext=centerPt, textcoords="axes fraction",
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)


def plotMidText(cntrPt, parentPt, txtString):
    """Write txtString at the midpoint of the edge between parent and child."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively plot a nested-dict decision tree below parentPt.

    Uses function attributes (xOff, yOff, totalW, totalD) set by createPlot
    as the drawing cursor and scale.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            # A dict child is another decision node: recurse one level down.
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # A non-dict child is a leaf: advance the x cursor and draw it.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlot(inTree):
    """Create a figure and draw the whole decision tree inTree."""
    # Imported lazily so the numeric part of this module works without matplotlib.
    import matplotlib.pyplot as plt
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    # BUG FIX: totalD must be the tree DEPTH; the original reused getNumLeafs,
    # which distorts the vertical spacing of the plot.
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), "")
    plt.show()


def createDataSet():
    """Return the toy fish dataset and the names of its two features."""
    dataSet = [[1, 1, "yes"],
               [1, 1, "yes"],
               [1, 0, "no"],
               [0, 1, "no"],
               [0, 1, "no"]]
    labels = ["no surfacing", "flippers"]
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels (last column of each row).

    The higher the entropy, the more mixed (disordered) the classes are.
    """
    numEntries = len(dataSet)
    # Count occurrences of each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # logarithm base 2
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that column removed."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Keep everything before and after the filtered column.
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the column index whose split yields the largest information gain."""
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Distinct values found in column i.
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            # Weighted entropy after splitting on (column i == value).
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the class name that occurs most often in classList."""
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # BUG FIX: the original read `sorted(classCount,iteritems(), ...)`, which is
    # a NameError; `classCount.items()` is the correct Python 3 spelling.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    NOTE: `labels` is mutated (the chosen feature name is deleted); pass a
    copy if the caller still needs the full list afterwards.
    """
    classList = [example[-1] for example in dataSet]
    # Stop 1: every sample has the same class label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop 2: all features consumed but classes still mixed -> majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # Copy so every sibling branch sees the same remaining labels.
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def getNumLeafs(myTree):
    """Count the leaf nodes of a nested-dict decision tree."""
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            # A dict child is a decision node: recurse.
            numLeafs += getNumLeafs(secondDict[key])
        else:
            # A plain value is a leaf.
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    """Return the number of decision levels in a nested-dict decision tree."""
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def classify(inputTree, featLabels, testVec):
    """Classify testVec by walking the tree; featLabels maps names to columns.

    NOTE(review): if testVec carries a feature value that never appeared
    during training, no branch matches and this raises UnboundLocalError
    (preserved original behavior).
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # Convert the feature-name string into a column index of testVec.
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


def storeTree(inputTree, filename):
    """Serialize a decision tree to filename with pickle."""
    import pickle
    # FIX: use a context manager so the handle is closed even on error.
    with open(filename, "wb") as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a pickled decision tree from filename.

    SECURITY NOTE: pickle.load executes arbitrary code; only load files
    this program wrote itself, never untrusted input.
    """
    import pickle
    # FIX: the original never closed the file handle.
    with open(filename, "rb") as fr:
        return pickle.load(fr)


def predictTypes():
    """Build and plot a decision tree for the contact-lenses dataset."""
    # BUG FIX: lenses.txt is TAB-separated; splitting on a single space breaks
    # multi-word class labels such as "no lenses" into two columns.
    # (The blog rendering flattened the tabs to spaces.)
    with open("lenses.txt") as fr:
        # e.g. [["young", "myope", "no", "reduced", "no lenses"], ...]
        lenses = [inst.strip().split("\t") for inst in fr.readlines()]
    lensesLabels = ["age", "prescript", "astigmatic", "tearRate"]
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)
    createPlot(lensesTree)


def main():
    predictTypes()


if __name__ == "__main__":
    main()
lenses.txt文件內(nèi)容如下
young myope no reduced no lenses young myope no normal soft young myope yes reduced no lenses young myope yes normal hard young hyper no reduced no lenses young hyper no normal soft young hyper yes reduced no lenses young hyper yes normal hard pre myope no reduced no lenses pre myope no normal soft pre myope yes reduced no lenses pre myope yes normal hard pre hyper no reduced no lenses pre hyper no normal soft pre hyper yes reduced no lenses pre hyper yes normal no lenses presbyopic myope no reduced no lenses presbyopic myope no normal no lenses presbyopic myope yes reduced no lenses presbyopic myope yes normal hard presbyopic hyper no reduced no lenses presbyopic hyper no normal soft presbyopic hyper yes reduced no lenses presbyopic hyper yes normal no lenses
文章版權(quán)歸作者所有,未經(jīng)允許請勿轉(zhuǎn)載,若此文章存在違規(guī)行為,您可以聯(lián)系管理員刪除。
轉(zhuǎn)載請注明本文地址:http://specialneedsforspecialkids.com/yun/43459.html
摘要:總而言之,決策樹第一個是需要從大量的已存在的樣本中推出可供做決策的規(guī)則,同時,這個規(guī)則應(yīng)該避免做無謂的損耗。算法原理構(gòu)造決策樹的關(guān)鍵步驟是分裂屬性。這時分裂屬性可能會遇到三種不同的情況對離散值生成非二叉決策樹。對離散值生成二叉決策樹。 算法背景 決策樹顧名思義是用于基于條件來做決策的,而它運行的邏輯相比一些復(fù)雜的算法更容易理解,只需按條件遍歷樹就可以了,需要花點心思的是理解如何建立決策...
摘要:決策樹分支轉(zhuǎn)存寫代碼的方法今天是周日,我還在倒騰決策樹,然后發(fā)現(xiàn)了一個不用裝軟件也能導(dǎo)出的方法,而且更簡單。剛開始看視頻的時候是看的視頻,講得真差,太模糊了,不適合我。 決策樹分支dot轉(zhuǎn)存pdf 1、寫代碼的方法 今天是周日,我還在倒騰決策樹,然后發(fā)現(xiàn)了一個不用裝軟件也能導(dǎo)出pdf的方法,而且更簡單。參照了這個中文的文檔實現(xiàn):http://sklearn.apachecn.org/c....
閱讀 2413·2021-08-18 10:21
閱讀 2519·2019-08-30 13:45
閱讀 2155·2019-08-30 13:16
閱讀 2100·2019-08-30 12:52
閱讀 1363·2019-08-30 11:20
閱讀 2622·2019-08-29 13:47
閱讀 1622·2019-08-29 11:22
閱讀 2760·2019-08-26 12:11