from collections import Counter

from IPython.display import Image
from numpy import genfromtxt, savetxt
# Show the poker-hand illustrations (meaningful in a Jupyter notebook, where
# the last bare Image expression of a cell is rendered inline; in a plain
# script these objects are created and immediately discarded).
Image(filename='data/poker.jpg')
Image(filename='data/poker.png')
def histogram(l):
    """Return a dict mapping each distinct value in l to its occurrence count."""
    counts = {}
    for value in l:
        counts[value] = counts.get(value, 0) + 1
    return counts
def hOfH(l, size=None):
    """Histogram-of-histogram of l, as a descending list of counts.

    First count occurrences of each value, then count how often each
    occurrence-count appears; return those multiplicities sorted descending.
    If size is given, the result is zero-padded or truncated to that length.

    Fix: the original called dict.values(...) and then list methods
    (.sort(), +=) on the result, which only works on Python 2 where values()
    returned a list; Counter + sorted() works on both.
    """
    result = sorted(Counter(Counter(l).values()).values(), reverse=True)
    if size is not None:
        result += [0] * (size - len(result))  # pad (no-op when already long enough)
        del result[size:]                     # truncate (no-op when short enough)
    return result
def handToFeatures(hand):
    """Convert a 10-element poker hand [s1, r1, s2, r2, ..., s5, r5] into
    features [flush, kind1, kind2, high, low, straight].

    - flush: 1 when all five suits are equal, else 0.
    - kind1/kind2: largest and second-largest rank multiplicities
      (2/1 = one pair, 2/2 = two pair, 3/2 = full house, ...).
    - high/low: highest and lowest rank; an ace (rank 1) counts as high.
    - straight: True when the five ranks are consecutive (mod 13, so
      10-J-Q-K-A also qualifies).

    Fix: the original sorted dict.values(...) in place, which crashes on
    Python 3; rank counts now come from Counter + sorted().
    """
    suits = [hand[0], hand[2], hand[4], hand[6], hand[8]]
    ranks = [hand[1], hand[3], hand[5], hand[7], hand[9]]
    # Flush iff there is exactly one distinct suit.  (Equivalent to the
    # original hOfH(suits, 4) == [1, 0, ...] check: a single suit group is
    # the only way to get one count occurring once and nothing else.)
    flush = int(len(set(suits)) == 1)
    counts = sorted(Counter(ranks).values(), reverse=True)
    kind1 = counts[0]
    kind2 = counts[1]
    ranks.sort()
    if 1 in ranks:
        high = 1  # ace is the high card
        # After sorting, an ace sits at index 0, so the true low is index 1.
        low = ranks[1] if ranks[0] == 1 else ranks[0]
    else:
        high = ranks[-1]
        low = ranks[0]
    # Shift ranks mod 13 relative to low; five consecutive ranks end at 4.
    normalized = sorted((r - low + 13) % 13 for r in ranks)
    straight = normalized[-1] == 4
    return [flush, kind1, kind2, high, low, straight]
def main():
    """Preprocess the raw Kaggle poker CSVs into feature CSVs.

    Reads data/train.csv and data/test.csv, converts each hand to the
    handToFeatures() feature vector, and writes data/trainPrep.csv
    (features + hand label) and data/testPrep.csv (id + features).
    """
    # Skip the header row with [1:].  genfromtxt accepts a path directly;
    # the original passed open(...) and leaked the file handles.
    train = genfromtxt('data/train.csv', delimiter=',', dtype='f8')[1:]
    test = genfromtxt('data/test.csv', delimiter=',', dtype='f8')[1:]
    # Training rows: 10 card columns then the hand label in the last column.
    ptrain = [handToFeatures(row[:-1]) + [row[-1]] for row in train]
    # fmt carries the commas, so each 7-element row is formatted as one unit.
    savetxt("data/trainPrep.csv", ptrain, delimiter=',',
            fmt='%d,%d,%d,%d,%d,%d,%d',
            header='flush,kind1,kind2,high,low,straight,hand', comments='')
    # Test rows: id in the first column then the 10 card columns.
    ptest = [[row[0]] + handToFeatures(row[1:]) for row in test]
    savetxt("data/testPrep.csv", ptest, delimiter=',',
            fmt='%d,%d,%d,%d,%d,%d,%d',
            header='id,flush,kind1,kind2,high,low,straight', comments='')
# NOTE(review): everything below is a pasted notebook/REPL transcript that
# walks through handToFeatures step by step on the last training row.  Bare
# expression lines (e.g. `11`, `array([...])`, `{1.0: 4, 4.0: 1}`) are
# echoed interpreter OUTPUT, not live statements, and `array` is never
# imported here.  Kept verbatim as a worked example; it is not meant to run
# as a script.
train = genfromtxt(open('data/train.csv','r'), delimiter=',', dtype='f8')[1:]
train
array([[ 4., 9., 2., ..., 2., 8., 0.], [ 1., 4., 3., ..., 2., 7., 0.], [ 1., 11., 4., ..., 2., 1., 2.], ..., [ 1., 8., 4., ..., 2., 13., 0.], [ 4., 12., 3., ..., 4., 6., 0.], [ 1., 1., 1., ..., 4., 2., 1.]])
ptrain = []
for x in train:
    y = x[-1]
    f = handToFeatures(x[:-1])
    ptrain.append(f+[y])
hand = train[-1]
suits = [train[-1][0], train[-1][2], train[-1][4], train[-1][6], train[-1][8]]
ranks = [train[-1][1], train[-1][3], train[-1][5], train[-1][7], train[-1][9]]
len(hand)
11
hand
array([ 1., 1., 1., 3., 1., 7., 1., 2., 4., 2., 1.])
suits
[1.0, 1.0, 1.0, 1.0, 4.0]
ranks
[1.0, 3.0, 7.0, 2.0, 2.0]
sh = hOfH(suits, 4)
sh
[1, 1, 0, 0]
h = histogram(suits) # count of each suit (hearts, diamonds, clubs, ...)
h # ah — four hearts and one club.
{1.0: 4, 4.0: 1}
h = histogram(dict.values(h))
h # number of distinct suit-count values
{1: 1, 4: 1}
result = dict.values(h)
result
[1, 1]
result.sort(reverse=True)
len(result)
2
if(4 is not None):
    while(len(result)<4):
        result += [0]
    while len(result) > 4:
        result.pop()
result
[1, 1, 0, 0]
flush = int(result[0] == 1 and result[1] == 0) # only one distinct suit count means all 5 cards share one suit, hence a flush
flush
0
h = dict.values(histogram(ranks)) # this time look at the distinct rank (card number) values
h # one rank number appears twice; the other three ranks are all different.
[1, 2, 1, 1]
h.sort(reverse=True)
h
[2, 1, 1, 1]
kind1 = h[0] # kind1 == 2 means one pair
kind2 = h[1] # kind2 == 2 as well means two pair
kind1
2
kind2
1
ranks.sort()
ranks
[1.0, 2.0, 2.0, 3.0, 7.0]
if(1 in ranks):
    high = 1
    low = ranks[0]
    if low == 1:
        low = ranks[1]
else:
    high = ranks[-1]
    low = ranks[0]
low # high == 1 means an ACE, so when the first sorted rank is 1 it is really the highest card and the second rank is the lowest; otherwise the first rank is simply the lowest.
2.0
high
1
normalized = [(r - low + 13)%13 for r in ranks] # shift ranks mod 13 relative to low and sort; if the last value is 4 the ranks are consecutive (e.g. 1,2,3,4,5 gives 5-1 == 4), i.e. a straight
normalized
[12.0, 0.0, 0.0, 1.0, 5.0]
normalized.sort()
normalized
[0.0, 0.0, 1.0, 5.0, 12.0]
straight = normalized[-1]==4
straight
False
#!/usr/bin/python
# NOTE(review): this originally did `import scipy as sp` and used
# sp.maximum/sp.minimum/sp.log/sp.subtract — those NumPy-alias functions
# were deprecated and removed from modern SciPy.  NumPy (already a
# dependency of this file) is used directly instead.
import numpy as np

def llfun(act, pred):
    """Mean binary log loss (cross-entropy) of predictions `pred` against
    0/1 labels `act`.  Lower is better; 0 means perfect prediction.
    """
    epsilon = 1e-15
    # Clamp predictions away from 0 and 1 so log() never sees an exact 0.
    pred = np.clip(pred, epsilon, 1 - epsilon)
    act = np.asarray(act)
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return -ll / len(act)
#!/usr/bin/python
from sklearn.ensemble import *
from sklearn import svm
from sklearn import cross_validation
import numpy as np
def main():
    """5-fold cross-validate a GradientBoostingClassifier on the prepped
    poker training data and print the mean fold accuracy.
    """
    classifier = GradientBoostingClassifier
    # read in data, parse into training and target sets
    # (genfromtxt accepts a path directly; the original leaked an open() handle)
    dataset = np.genfromtxt('data/trainPrep.csv', delimiter=',', dtype='f8')[1:]
    target = np.array([x[-1] for x in dataset])
    train = np.array([x[:-1] for x in dataset])
    # In this case we'll use gradient boosting, but this could be any classifier
    cfr = classifier()
    # Simple K-Fold cross validation. 5 folds.
    # NOTE(review): sklearn.cross_validation was removed in modern
    # scikit-learn; the equivalent is
    # model_selection.KFold(n_splits=5, shuffle=True).split(train).
    cv = cross_validation.KFold(len(train), n_folds=5, shuffle=True)
    # iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list
    results = []
    for traincv, testcv in cv:
        predicted = cfr.fit(train[traincv], target[traincv]).predict(train[testcv])
        # fold accuracy = fraction of exact label matches
        results.append(np.mean(predicted == target[testcv]))
    # print out the mean of the cross-validated results
    # (print() call so this also runs on Python 3; the original used the
    # Python 2 print statement, a syntax error under Python 3)
    print("Results: " + str(np.array(results).mean()))

if __name__=="__main__":
    main()
# Output: Results: 0.999520191923
import re
import os
def createNamesDatabase(*names):
    """Build a set of name strings; non-string arguments are coerced with str().

    Fix: the original tried result.add(set(arg)) for non-strings, which
    always raises TypeError (sets are unhashable), so the bare except:
    made every non-string fall through to str(arg) anyway.  The coercion
    is now done directly and the dead try/except is gone.

    >>> db = createNamesDatabase("abc", "123", "abc123", "abc_123")
    >>> "abc" in db
    True
    >>> "abc123" in db
    True
    >>> "abC123" in db
    False
    """
    result = set()
    for arg in names:
        result.add(arg if isinstance(arg, str) else str(arg))
    return result
def addToNamesDatabase(db, *names):
    """Add each name to the set db in place; non-strings are coerced with str().

    Fix: the original attempted db.add(set(arg)) for non-strings, which
    always raises TypeError (sets are unhashable) and so the bare except:
    fell back to str(arg) every time — the fallback is now done directly.
    """
    for arg in names:
        db.add(arg if isinstance(arg, str) else str(arg))
def splitName(name, numPattern=r'[0-9]+', delimPattern=r'[-._]+'):
    """Split name into delimiter / number / text tokens.

    Returns (tokens, numIndices): tokens is the full token list such that
    "".join(tokens) == name, and numIndices lists the positions of tokens
    matching numPattern.
    """
    tokens = []
    numIndices = []
    # Capture groups keep the delimiters and numbers in the token stream.
    for chunk in re.split("(" + delimPattern + ")", name):
        for token in re.split("(" + numPattern + ")", chunk):
            if re.match(numPattern, token):
                numIndices.append(len(tokens))
            tokens.append(token)
    return tokens, numIndices
def incrementName(name, numPattern=r'[0-9]+', delimPattern=r'[-._]+', numDigits=4, defaultDelim="_"):
    """Return name with its last number incremented, preserving zero-padding.

    If name contains no number, defaultDelim plus numDigits zeros is
    appended first, so "abc" -> "abc_0001".
    """
    # Bug fix: forward numPattern/delimPattern to splitName — the original
    # called splitName(name) and silently ignored the caller's patterns.
    parts, nums = splitName(name, numPattern, delimPattern)
    if not nums:
        return incrementName(name + defaultDelim + "0" * numDigits,
                             numPattern, delimPattern, numDigits, defaultDelim)
    i = nums[-1]
    width = len(parts[i])
    # zfill restores the original zero-padding (no-op when the number grew).
    parts[i] = str(int(parts[i]) + 1).zfill(width)
    return "".join(parts)
def uniquifyName(name, db, numPattern=r'[0-9]+', delimPattern=r'[-._]+', numDigits=4, defaultDelim="_"):
    """Increment name until it is not in db, record it in db, and return it.

    >>> db = createNamesDatabase("abc", "123", "abc123", "abc_123", "xyz_123_abc")
    >>> uniquifyName("123", db)
    '124'
    >>> uniquifyName("123", db)
    '125'
    >>> uniquifyName("123", db)
    '126'
    >>> uniquifyName("127", db)
    '127'
    >>> uniquifyName("xyz_123_abc", db)
    'xyz_124_abc'
    >>> uniquifyName("abc", db)
    'abc_0001'
    """
    candidate = name
    while candidate in db:
        # Bug fix: forward the pattern/padding options — the original called
        # incrementName(n) and silently ignored the caller's arguments.
        candidate = incrementName(candidate, numPattern, delimPattern,
                                  numDigits, defaultDelim)
    addToNamesDatabase(db, candidate)
    return candidate
def generateUniqueFilename(path):
    """Return path unchanged if nothing exists there; otherwise return a
    variant whose numeric suffix has been incremented past every existing
    sibling filename (extension preserved).
    """
    if not os.path.exists(path):
        return path
    directory = os.path.dirname(path)
    root, ext = os.path.splitext(os.path.basename(path))
    # Ensure the root has a number so uniquifyName can increment it.
    if not re.search(r'[0-9]+', root):
        root += "_0000"
    db = createNamesDatabase()
    # Bug fix: os.listdir('') raises FileNotFoundError when path has no
    # directory component; fall back to the current directory.
    for f in os.listdir(directory or "."):
        r, _ = os.path.splitext(os.path.basename(f))
        addToNamesDatabase(db, r)
    return os.path.join(directory, uniquifyName(root, db) + ext)
def _test():
    """Run this module's doctests (createNamesDatabase, uniquifyName, ...)."""
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
#!/usr/bin/python
from sklearn import svm
from numpy import genfromtxt, savetxt
def main():
    """Train an SVM on the prepped poker features and write a Kaggle
    submission CSV (id,hand) to a unique filename under data/.
    """
    clf = svm.SVC(gamma=0.001, C=100)
    # create the training & test sets, skipping the header row with [1:]
    # (genfromtxt accepts a path directly; the original leaked open() handles)
    dataset = genfromtxt('data/trainPrep.csv', delimiter=',', dtype='f8')[1:]
    target = [x[-1] for x in dataset]
    train = [x[:-1] for x in dataset]
    test = genfromtxt('data/testPrep.csv', delimiter=',', dtype='f8')[1:]
    test = [x[1:] for x in test]  # drop the id column before predicting
    clf.fit(train, target)
    # Predict all rows at once: per-row predict(x) on a 1-D sample is slow
    # and rejected by modern scikit-learn (which requires 2-D input), and
    # the original stored the raw ndarray result instead of a scalar label.
    predicted = [[i, int(y)] for i, y in enumerate(clf.predict(test), start=1)]
    filename = generateUniqueFilename("data/submission.csv")
    # print() call so this runs on Python 3 (the original used the Python 2
    # print statement, a syntax error under Python 3)
    print("Generating submission: " + filename)
    savetxt(filename, predicted, delimiter=',', fmt='%d,%d',
            header='id,hand', comments='')

if __name__=="__main__":
    main()
# Output: Generating submission: data\submission_0000.csv