机器学习识别验证码与百度莱茨狗

最近很火的百度莱茨狗想必大家都有接触。不知道大家都有几只史诗狗了？
0x01 领狗
莱茨狗4只狗免费领取地址分享
领取地址1：https://pet-chain.baidu.com/chain/splash
领取地址2：https://pet-chain.baidu.com/chain/splash?appId=2&tpl=wallet
领取地址3：https://pet-chain.baidu.com/chain/splash?appId=3&tpl=wallet
领取地址4：https://pet-chain.baidu.com/chain/splash?appId=4&tpl=wallet

不过我昨天只领到了两只。

0x02 刷市场买便宜狗

# -*- coding: utf-8 -*-
import requests
import json
import time

# HTTP headers attached to every request this script sends.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Mobile Safari/537.36'
    ),
    'Content-Type': 'application/json',
    # Replace the empty string with your own cookie.
    'Cookie': '',
    'Origin': 'https://pet-chain.baidu.com',
    'Referer': 'https://pet-chain.baidu.com/chain/dogMarket',
}

class pet_chain:
    def getInfo(petId):  #获取宠物信息
        url = "https://pet-chain.baidu.com/data/pet/queryPetByIdWithAuth"
        data = json.dumps({"petId":petId,"requestId":1517813967221,"appId":1,"tpl":""})
        req = requests.post(url, headers=headers, data=data)
        text = json.loads(req.content)
        #print text['data']['userName']
        #print text['data']['id']
        #print text['data']['rareDegree']
        #print text['data']['amount']
        #print text['data']['ethAddr']
        return {'userName':text['data']['userName'],'id':text['data']['id'],'rareDegree':text['data']['rareDegree'],'amount':text['data']['amount'],'ethAddr':text['data']['ethAddr'] }

    def buy(self,petId): #购买宠物
        url = 'https://pet-chain.baidu.com/data/txn/create'
        data = json.dumps({"petId":petId,"requestId":1517818584013,"appId":1,"tpl":""})
        req = requests.post(url, headers=headers, data=data)
        print req.content


    def getMarket(self): #搜索市场
        url = 'https://pet-chain.baidu.com/data/market/queryPetsOnSale'
        data = json.dumps({"pageNo":2,"pageSize":2000,"querySortType":"AMOUNT_ASC","petIds":[]}) #按价钱从低到高排序。每页2000只狗。
        req = requests.post(url, headers=headers, data=data)
        text = json.loads(req.content)
        for pet in text['data']['petsOnSale']:
            if(pet['rareDegree'] > 1):  #只要卓越以上的
                #print pet
                print '品种:' , pet['rareDegree']  #0 - 5 分别为 普通，优秀，卓越， 史诗  后面的没见过。
                print '价钱：' , pet['amount']
                print 'id:' , pet['id']
                print 'petId:' , pet['petId']

                #self.buy(pet['petId']) #是否自动购买。


if __name__ == '__main__':
    market = pet_chain()
    # To buy one specific pet instead, call:
    #   market.buy('1858371085567322557')

    # List the pets currently on sale.
    market.getMarket()

通过这个脚本可以买到便宜的狗。但是后来百度对市场进行了限制。
1、每页只能显示20只狗
2、买狗需要验证码

0x03 基于SVM的机器学习识别验证码

1、批量获取验证码

# -*- coding: utf-8 -*-
import requests
import json
import base64
import time

# HTTP headers attached to every request this script sends.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36',
    'Content-Type': 'application/json',
    'Cookie': '',  # Replace the empty string with your own cookie.
    'Origin': 'https://pet-chain.baidu.com',
    'Referer': 'https://pet-chain.baidu.com/chain/dogMarket'
    }

class captcha:
    """Fetches captcha images from the pet-chain captcha endpoint."""

    def __init__(self):
        # The class keeps no state, so there is nothing to initialise.
        pass

    def getCaptcha(self):
        """Request a new captcha and return the ``data.img`` field of the reply.

        The request uses a 2-second timeout. The script's main loop
        base64-decodes the returned value before saving it.
        """
        endpoint = 'https://pet-chain.baidu.com/data/captcha/gen'
        payload = {"requestId":1517879979928,"appId":1,"tpl":""}
        response = requests.post(endpoint, headers=headers, data=json.dumps(payload), timeout=2)
        reply = json.loads(response.content)
        return reply['data']['img']

        
if __name__=='__main__':
    w = captcha()
    # Harvest captcha samples in bulk; runs until interrupted (Ctrl-C).
    while True:
        try:
            imgdata = base64.b64decode(w.getCaptcha())
        except Exception as err:
            # Skip this round: there is no fresh image to save. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            print('captcha fetch failed: %r' % (err,))
            time.sleep(1)  # avoid hammering the server in a tight loop
            continue
        # The timestamp only serves as a unique-ish file name; the images
        # are renamed by hand afterwards.
        with open('img/' + str(time.time()) + '.jpg', 'wb') as out:
            out.write(imgdata)

2、打码
我是手动打码的：根据验证码内容重命名图片文件，例如验证码是 ADTF 的图片就改名为 ADTF.jpg。

3、机器学习

# -*- coding: utf-8 -*-
import os
import cv2
import time
import joblib
import numpy as np
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from sklearn.svm import SVC

class learn:
    """Train and apply a per-letter SVM for the four-letter captcha images.

    Workflow: makeCase() cuts labelled captchas into single-letter samples,
    trainSVM() fits a linear SVC on them and saves it to 'letter.pkl', and
    main()/run() cut an unlabelled captcha the same way and classify each
    piece.
    """

    # Crop boxes (left, upper, right, lower) of the four letters, in
    # left-to-right reading order.
    LETTER_BOXES = (
        (50, 10, 97, 80),
        (95, 10, 144, 80),
        (142, 10, 188, 80),
        (185, 10, 235, 80),
    )
    # Scratch files the cropped letters are written to before recognition,
    # in the same reading order as LETTER_BOXES.
    TMP_FILES = (
        './img/tmp/1.jpg',
        './img/tmp/2.jpg',
        './img/tmp/3.jpg',
        './img/tmp/4.jpg',
    )

    def __init__(self):
        self.goodCaseDir = './img/good/'  # labelled captchas (file name = captcha text)
        self.learnJobDir = './img/job/'   # single-letter training samples
        self.noneDir = './img/none/'      # captchas that have not been labelled
        self._model = None                # classifier, loaded on first use

    def _splitLetters(self, im):
        """Return the four single-letter crops of ``im``, left to right."""
        return [im.crop(box) for box in self.LETTER_BOXES]

    def _saveTmpLetters(self, im):
        """Save the four letter crops of ``im`` to TMP_FILES.

        Returns the scratch paths in reading order, so callers never depend
        on directory-listing order or on stray files in the scratch folder.
        """
        for piece, path in zip(self._splitLetters(im), self.TMP_FILES):
            piece.save(path)
        return list(self.TMP_FILES)

    def makeCase(self):
        """Cut every labelled captcha in goodCaseDir into letter samples.

        Each sample is saved in learnJobDir as
        ``<letter>_<captcha text>_<position>.jpg``. The first character is
        the label that extractLetters() reads back. Names are unique per
        source file and position, so repeated letters never overwrite one
        another. Files whose name has fewer than four characters before the
        first dot are skipped, because their crops cannot be labelled.
        """
        for name in os.listdir(self.goodCaseDir):
            txt = name.split('.')[0]
            if len(txt) < len(self.LETTER_BOXES):
                continue
            im = Image.open(self.goodCaseDir + name)
            for pos, piece in enumerate(self._splitLetters(im)):
                piece.save('%s%s_%s_%d.jpg' % (self.learnJobDir, txt[pos], txt, pos))

    def extractLetters(self, path):
        """Build the training set from the sample images in ``path``.

        Returns:
            ``(x, y)``: ``x`` is a list of feature vectors (see getletter)
            and ``y`` the matching labels, i.e. the lower-cased first
            character of each file name.
        """
        x = []
        y = []
        for name in os.listdir(path):
            x.append(self.getletter(os.path.join(path, name)))
            y.append(name[0].lower())
        return x, y

    @staticmethod
    def _binarize(img):
        """Flatten an H x W x C pixel array into a row-major list of 0/1 ints.

        A pixel maps to 1 when any of its first three channels differs from
        255 (i.e. it is not pure white), otherwise to 0.
        """
        pixels = np.asarray(img)[:, :, :3]
        return (pixels != 255).any(axis=2).astype(int).ravel().tolist()

    def getletter(self, fn):
        """Return the 10000-element 0/1 feature vector of image file ``fn``.

        The image is resized to 100x100 and binarized with _binarize().

        Raises:
            IOError: if the file cannot be read as an image.
        """
        fnimg = cv2.imread(fn)
        if fnimg is None:  # cv2.imread signals failure by returning None
            raise IOError('cannot read image: %s' % fn)
        img = cv2.resize(fnimg, (100, 100))
        return self._binarize(img)

    def trainSVM(self):
        """Fit a linear SVC on the samples in learnJobDir and save it.

        The model is written to 'letter.pkl' and kept in memory for
        subsequent ocrImg() calls.
        """
        x, y = self.extractLetters(self.learnJobDir)
        letterSVM = SVC(kernel="linear", C=1).fit(x, y)
        joblib.dump(letterSVM, 'letter.pkl')
        self._model = letterSVM

    def ocrImg(self, fileName):
        """Return the predicted letter for the single-letter image ``fileName``.

        The classifier is loaded from 'letter.pkl' on first use and reused
        afterwards.
        """
        if getattr(self, '_model', None) is None:
            # NOTE: joblib.load unpickles the file; only load a letter.pkl
            # that you produced yourself.
            self._model = joblib.load('letter.pkl')
        data = np.array([self.getletter(fileName)])
        return self._model.predict(data)[0]

    def run(self):
        """Interactively check recognition on every captcha in noneDir.

        For each image: show it, print the four predicted letters in
        reading order, then wait for Enter before moving on.
        """
        for name in os.listdir(self.noneDir):
            im = Image.open(self.noneDir + name)
            im.show()
            for path in self._saveTmpLetters(im):
                print(self.ocrImg(path))
            raw_input()  # Python 2 builtin: pause until Enter is pressed

    def main(self, im):
        """Recognize captcha image ``im`` and return its text.

        Args:
            im: an image object providing ``crop`` (the script passes a PIL
                image).

        Returns:
            The predicted letters joined in left-to-right order.
        """
        return ''.join(self.ocrImg(path) for path in self._saveTmpLetters(im))

        
    
if __name__ == '__main__':
    recognizer = learn()

    # Step 1: cut the labelled captchas into samples (call is disabled).
    print('开始对正样本切图.....')
    #recognizer.makeCase()

    # Step 2: train and save the model (call is disabled).
    print('样本切图完成，开始进行学习建模.....')
    #recognizer.trainSVM()
    print('建模完成')
    #recognizer.ocrImg('./img/3.jpg')

    # Step 3: recognize a single captcha image.
    print('开始识别验证码')
    #recognizer.run()
    captcha_image = Image.open('code.jpg')
    print(recognizer.main(captcha_image))

0x04 再次刷狗
整合一下，这次只发部分代码（代码中的 q 应为上文 learn 类的实例，另外还需要 import cStringIO）。


    def buy(self, petId, validCode):
        """Fetch a captcha, read it automatically and submit a purchase.

        Args:
            petId: identifier of the pet to buy.
            validCode: value sent as ``validCode`` in the purchase request.

        Prints the recognized captcha, then the ``errorMsg`` field of the
        purchase reply, then the raw reply.
        """
        # Step 1: request a fresh captcha image (base64 text in data.img).
        captcha_payload = json.dumps({"requestId":1517889037133,"appId":1,"tpl":""})
        captcha_resp = requests.post(
            'https://pet-chain.baidu.com/data/captcha/gen',
            headers=headers, data=captcha_payload, timeout=2)
        encoded_img = json.loads(captcha_resp.content)['data']['img']

        # Step 2: decode it in memory and read the text.
        # NOTE(review): q is defined outside this excerpt; presumably the
        # captcha recognizer instance -- confirm.
        im = Image.open(cStringIO.StringIO(base64.b64decode(encoded_img)))
        captcha = q.main(im)
        print(captcha)

        # Step 3: submit the purchase together with the recognized captcha.
        order_payload = json.dumps({"petId":petId,"captcha":captcha,"validCode":validCode,"requestId":1517888890852,"appId":1,"tpl":""})
        order_resp = requests.post(
            'https://pet-chain.baidu.com/data/txn/create',
            headers=headers, data=order_payload, timeout=2)
        print(json.loads(order_resp.content)['errorMsg'])
        print(order_resp.content)


    def getMarket(self,page):
        url = 'https://pet-chain.baidu.com/data/market/queryPetsOnSale'
        i = 1
        while i <= page:
            print i
            try:
                data = json.dumps({"pageNo":page,"pageSize":1,"querySortType":"AMOUNT_ASC","petIds":[],"requestId":1517887362977,"appId":1,"tpl":""})
                req = requests.post(url, headers=headers, data=data, timeout=5)
                #print req.content
                text = json.loads(req.content)

                for pet in text['data']['petsOnSale']:
                    if(pet['rareDegree'] >= 0):# and float(pet['amount']) <= 1800) :
                        print pet
                        print '品种:' , pet['rareDegree']
                        print '价钱：' , pet['amount']
                        print 'id:' , pet['id']
                        print 'petId:' , pet['petId']
                i += 1
            except:
                pass
                time.sleep(2)
        return {'petId': pet['petId'],'validCode': pet['validCode']}

最后我刷到了两只史诗狗、三只卓越狗。希望大家也能有所收获。【技术上的！】

机器学习识别验证码与百度莱茨狗

推荐阅读更多精彩内容