In the last post I wrote up a fun little script: credential-stuffing domestic "某榴" accounts with Python https://www.52pojie.cn/thread...
Many readers pointed out that those accounts are protected by Google verification, so even a cracked account is of little use. In fact, freshly registered accounts still work fine; for the details of the credential-stuffing attack on "某榴" accounts, please see the previous post.
This time, let's look at how to grab the topics on "某逼乎" (Zhihu).
Zhihu is about as hot as community sites get these days, and its content quality is comparatively high among communities, so we often want to crawl its best topics. Not to show off: your own project may well turn out to need exactly this.
Programmers talk in code, so let's dig into it together:
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # true division
from Queue import Queue
from __builtin__ import False
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding( "utf-8" )
headers = {
"User-Agent" : "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
"Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
"X-Requested-With":"XMLHttpRequest",
"Referer":"https://www.zhihu.com/topics",
"Cookie":"__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a"
}
DB_HOST = "127.0.0.1"
DB_USER = "root"
DB_PASS = "root"
queue = Queue()  # receiving (work) queue
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
counter=0
base=""
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "zhihu", charset="utf8")
conn.autocommit(False)
curr = conn.cursor()
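One thing the post never shows is the MySQL schema. Here is a rough sketch of the tables the script reads and writes; every column type below is inferred from the SELECT/INSERT statements further down, not taken from the author's real DDL, so treat it as an assumption and adjust as needed:

# Hypothetical schema, reconstructed from the queries in this script.
# The original post never shows the real DDL.
DDL = [
    """CREATE TABLE IF NOT EXISTS classify_new (
           id INT AUTO_INCREMENT PRIMARY KEY,
           data_id VARCHAR(32), name VARCHAR(255))""",
    """CREATE TABLE IF NOT EXISTS classify_new_copy (
           id INT AUTO_INCREMENT PRIMARY KEY,
           data_id VARCHAR(32), name VARCHAR(255), status TINYINT DEFAULT 1)""",
    """CREATE TABLE IF NOT EXISTS classify (
           id INT AUTO_INCREMENT PRIMARY KEY,
           node VARCHAR(32), parent VARCHAR(255), name VARCHAR(255), status TINYINT DEFAULT 1)""",
    """CREATE TABLE IF NOT EXISTS rooms (
           id INT AUTO_INCREMENT PRIMARY KEY,
           father_id INT, name VARCHAR(255), friends_num INT, description TEXT,
           create_time DATETIME, creater_id INT, room_avatar VARCHAR(255),
           is_pass TINYINT, has_index TINYINT, reason_id INT)""",
]
for stmt in DDL:
    curr.execute(stmt)
conn.commit()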
def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        html = response.read()
        return html
    except:
        pass
    return None
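The comment in get_html says a proxy should go in there. A minimal sketch of one way to do that with urllib2's ProxyHandler; the proxy address below is a made-up placeholder, not something from the original post:

    # Optional: route all urllib2 traffic through an HTTP proxy.
    # "http://127.0.0.1:8888" is a placeholder address; substitute your own.
    proxy_handler = urllib2.ProxyHandler({"http": "http://127.0.0.1:8888",
                                          "https": "http://127.0.0.1:8888"})
    opener = urllib2.build_opener(proxy_handler)
    urllib2.install_opener(opener)  # every later urllib2.urlopen() call now goes via the proxy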
def getTopics():
url = "https://www.zhihu.com/topics" print url try: req = urllib2.Request(url) response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞� html = response.read().decode("utf-8") print html soup = BeautifulSoup(html) lis = soup.find_all("li", {"class" : "zm-topic-cat-item"}) for li in lis: data_id=li.get("data-id") name=li.text curr.execute("select id from classify_new where name=%s",(name)) y= curr.fetchone() if not y: curr.execute("INSERT INTO classify_new(data_id,name)VALUES(%s,%s)",(data_id,name)) conn.commit() except Exception as e: print "get topic error",e
def get_extension(name):
    where = name.rfind(".")
    if where != -1:
        return name[where:]
    return None
def which_platform():
    sys_str = platform.system()
    return sys_str
def GetDateString():
    when = time.strftime("%Y-%m-%d", time.localtime(time.time()))
    foldername = str(when)
    return foldername
def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + "\\" + GetDateString() + "\\" + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + "/" + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "kk", e
        return None
def download_img(url,classify):
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E:\\topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "\\" + name
        try:
            if "e82bab09c_m" in str(url):  # skip this known placeholder image
                return True
            if not os.path.exists(filename):
                file_object = open(filename, "w+b")
                file_object.write(dataimg)
                file_object.close()
                return "/room/default/" + GetDateString() + "/" + str(classify) + "/" + name
            else:
                print "file exist"
                return None
        except IOError as e1:
            print "e1=", e1
    except Exception as e:
        print "eee", e
    return None  # if the download failed, fall back to the source site's own link
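A side note: makeDateFolder and download_img hand-pick a path separator per platform. os.path.join does the same job portably; here is a sketch that should behave the same (the _portable suffix is mine, not the author's):

    def makeDateFolder_portable(par, classify):
        # os.path.join picks the right separator on every OS,
        # so the Linux/Windows branch disappears.
        if not os.path.isdir(par):
            return None
        newFolderName = os.path.join(par, GetDateString(), str(classify))
        if not os.path.isdir(newFolderName):
            os.makedirs(newFolderName)
        return newFolderName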
def getChildren(node,name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch = "父話題"
        node_name = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
        topic_cla = soup.find("div", {"class": "child-topic"})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all("a", {"class": "zm-item-tag"})  # grab all child-topic links
                if u"子話題" in p_ch:
                    for a in aList:
                        token = a.get("data-token")
                        a = str(a).replace("\n", "").replace("\r", "").replace("\t", "")
                        start = a.find(">")
                        end = a.rfind("<")
                        new_node = str(a[start + 1:end])  # the topic name between <a ...> and </a> (a.text.strip() would do the same)
                        curr.execute("select id from rooms where name=%s", (new_node,))  # make sure the name is unique first
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e
def getContent(n,name,p,top_id):
    global counter
    try:
        curr.execute("select id from rooms where name=%s", (name,))  # make sure the name is unique first
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
            pic_path = soup.find("a", {"id": "zh-avartar-edit-form"}).find("img").get("src")
            description = soup.find("div", {"class": "zm-editable-content"})
            if description is not None:
                description = description.text
            if u"未歸類" in title or u"根話題" in title:  # let these into the DB (to avoid an endless loop) but drop the description
                description = None
            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if (tag_path is not None) or tag_path == True:
                if tag_path == True:
                    tag_path = None
                father_id = 2  # default parent: the misc category
                curr.execute("select id from rooms where name=%s", (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute("select id from rooms where name=%s", (name,))  # make sure the name is unique first
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    temp = time.time()
                    x = time.localtime(float(temp))
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", x)  # current time
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    # this record qualifies to go into the database
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # commit right away, otherwise later lookups won't find the parent node
                    if counter % 200 == 0:
                        print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e
def work():
global queue curr.execute("select id,node,parent,name from classify where status=1") results = curr.fetchall() for r in results: top_id=r[0] node=r[1] parent=r[2] name=r[3] try: queue.put((node,name,parent)) #首先放入隊列 while queue.qsize() >0: n,p=queue.get() #頂節點出隊 getContent(n,p,top_id) getChildren(n,name) #出隊內容的子節點 conn.commit() except Exception as e: print "what"s wrong",e
def new_work():
global queue curr.execute("select id,data_id,name from classify_new_copy where status=1") results = curr.fetchall() for r in results: top_id=r[0] data_id=r[1] name=r[2] try: get_topis(data_id,name,top_id) except: pass
def get_topis(data_id,name,top_id):
    global queue
    url = "https://www.zhihu.com/node/TopicsPlazzaListV2"
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {"method": "next", "params": '{"topic_id":' + data_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode("utf-8")
                json_str = json.loads(html)
                ms = json_str["msg"]
                if len(ms) < 5:
                    break
                msg = ms[0]
            except Exception as e:
                print "eeeee", e
            # print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all("div", {"class": "blk"})
                for blk in blks:
                    page = blk.find("a").get("href")
                    if page is not None:
                        node = page.replace("/topic/", "")  # feed more seed topics into the database
                        parent = name
                        ne = blk.find("strong").text
                        try:
                            queue.put((node, ne, parent))  # seed the queue first
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # pop the head node
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the popped node's children
                            conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError as e:
            print "error is", e
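One more side note on get_topis: the params field is a JSON string assembled by hand-concatenation, which is fragile around the quoting. Building it with json.dumps is safer; a sketch, assuming the endpoint wants topic_id as a bare number, which is what the concatenated version produces:

    # Safer construction of the POST payload for TopicsPlazzaListV2.
    params = json.dumps({"topic_id": int(data_id), "offset": offset, "hash_id": ""})
    values = {"method": "next", "params": params}
    data = urllib.urlencode(values)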
if __name__ == "__main__":
    i = 0
    while i < 400:
        new_work()
        i = i + 1
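Note that the entry point only calls new_work(), which reads from classify_new_copy, while getTopics() fills classify_new; the post never shows the bridging step. Presumably you seed the tables once before the crawl. A guess at that bootstrap, under that assumption:

    # Assumed one-time bootstrap (not shown in the original post):
    # scrape the top-level topic list, then snapshot it for new_work() to consume.
    getTopics()
    curr.execute("INSERT INTO classify_new_copy(data_id,name) SELECT data_id,name FROM classify_new")
    conn.commit()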
The code is really quite simple; anyone with a bit of Python background can handle it, and the comments make it clear. Feel free to study and discuss it. My humble offering.
The article's copyright belongs to the author. Please do not reproduce it without permission; if this article violates any rules, you can contact an administrator to have it removed.
When reposting, please credit the original URL: http://specialneedsforspecialkids.com/yun/42793.html