In the last post I wrote up a fun little script: credential-stuffing domestic "某榴" accounts with Python https://www.52pojie.cn/thread...
Many readers pointed out that those accounts are protected by Google verification, so even a cracked account is of little use. In fact, freshly registered accounts still work fine; for the details of the credential-stuffing attack on "某榴" accounts, please see the previous post.
This time, let's look at how to grab the topics on "某逼乎" (Zhihu).
Zhihu is about as hot as community sites get these days, and its content quality is comparatively high among communities, so we often want to crawl its best topics. Not to show off: your own project may well turn out to need exactly this.
Programmers talk in code, so let's dig into it together:
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # true division
from Queue import Queue
from __builtin__ import False
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding( "utf-8" )
headers = {
"User-Agent" : "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
"Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
"X-Requested-With":"XMLHttpRequest",
"Referer":"https://www.zhihu.com/topics",
"Cookie":"__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a"
}
DB_HOST = "127.0.0.1"
DB_USER = "root"
DB_PASS = "root"
queue = Queue()  # receiving (work) queue
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
counter=0
base=""
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "zhihu", charset="utf8")
conn.autocommit(False)
curr = conn.cursor()
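One thing the post never shows is the MySQL schema. Here is a rough sketch of the tables the script reads and writes; every column type below is inferred from the SELECT/INSERT statements further down, not taken from the author's real DDL, so treat it as an assumption and adjust as needed:

# Hypothetical schema, reconstructed from the queries in this script.
# The original post never shows the real DDL.
DDL = [
    """CREATE TABLE IF NOT EXISTS classify_new (
           id INT AUTO_INCREMENT PRIMARY KEY,
           data_id VARCHAR(32), name VARCHAR(255))""",
    """CREATE TABLE IF NOT EXISTS classify_new_copy (
           id INT AUTO_INCREMENT PRIMARY KEY,
           data_id VARCHAR(32), name VARCHAR(255), status TINYINT DEFAULT 1)""",
    """CREATE TABLE IF NOT EXISTS classify (
           id INT AUTO_INCREMENT PRIMARY KEY,
           node VARCHAR(32), parent VARCHAR(255), name VARCHAR(255), status TINYINT DEFAULT 1)""",
    """CREATE TABLE IF NOT EXISTS rooms (
           id INT AUTO_INCREMENT PRIMARY KEY,
           father_id INT, name VARCHAR(255), friends_num INT, description TEXT,
           create_time DATETIME, creater_id INT, room_avatar VARCHAR(255),
           is_pass TINYINT, has_index TINYINT, reason_id INT)""",
]
for stmt in DDL:
    curr.execute(stmt)
conn.commit()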
def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        html = response.read()
        return html
    except:
        pass
    return None
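The comment in get_html says a proxy should go in there. A minimal sketch of one way to do that with urllib2's ProxyHandler; the proxy address below is a made-up placeholder, not something from the original post:

    # Optional: route all urllib2 traffic through an HTTP proxy.
    # "http://127.0.0.1:8888" is a placeholder address; substitute your own.
    proxy_handler = urllib2.ProxyHandler({"http": "http://127.0.0.1:8888",
                                          "https": "http://127.0.0.1:8888"})
    opener = urllib2.build_opener(proxy_handler)
    urllib2.install_opener(opener)  # every later urllib2.urlopen() call now goes via the proxy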
def getTopics():
url = "https://www.zhihu.com/topics" print url try: req = urllib2.Request(url) response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞� html = response.read().decode("utf-8") print html soup = BeautifulSoup(html) lis = soup.find_all("li", {"class" : "zm-topic-cat-item"}) for li in lis: data_id=li.get("data-id") name=li.text curr.execute("select id from classify_new where name=%s",(name)) y= curr.fetchone() if not y: curr.execute("INSERT INTO classify_new(data_id,name)VALUES(%s,%s)",(data_id,name)) conn.commit() except Exception as e: print "get topic error",e
def get_extension(name):
    where = name.rfind(".")
    if where != -1:
        return name[where:]
    return None
def which_platform():
    sys_str = platform.system()
    return sys_str
def GetDateString():
    when = time.strftime("%Y-%m-%d", time.localtime(time.time()))
    foldername = str(when)
    return foldername
def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + "\\" + GetDateString() + "\\" + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + "/" + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "kk", e
        return None
def download_img(url,classify):
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E:\\topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "\\" + name
        try:
            if "e82bab09c_m" in str(url):  # skip this known placeholder image
                return True
            if not os.path.exists(filename):
                file_object = open(filename, "w+b")
                file_object.write(dataimg)
                file_object.close()
                return "/room/default/" + GetDateString() + "/" + str(classify) + "/" + name
            else:
                print "file exist"
                return None
        except IOError as e1:
            print "e1=", e1
    except Exception as e:
        print "eee", e
    return None  # if the download failed, fall back to the source site's own link
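A side note: makeDateFolder and download_img hand-pick a path separator per platform. os.path.join does the same job portably; here is a sketch that should behave the same (the _portable suffix is mine, not the author's):

    def makeDateFolder_portable(par, classify):
        # os.path.join picks the right separator on every OS,
        # so the Linux/Windows branch disappears.
        if not os.path.isdir(par):
            return None
        newFolderName = os.path.join(par, GetDateString(), str(classify))
        if not os.path.isdir(newFolderName):
            os.makedirs(newFolderName)
        return newFolderName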
def getChildren(node,name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch = "父話題"
        node_name = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
        topic_cla = soup.find("div", {"class": "child-topic"})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all("a", {"class": "zm-item-tag"})  # grab all child-topic links
                if u"子話題" in p_ch:
                    for a in aList:
                        token = a.get("data-token")
                        a = str(a).replace("\n", "").replace("\r", "").replace("\t", "")
                        start = a.find(">")
                        end = a.rfind("<")
                        new_node = str(a[start + 1:end])  # the topic name between <a ...> and </a> (a.text.strip() would do the same)
                        curr.execute("select id from rooms where name=%s", (new_node,))  # make sure the name is unique first
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e
def getContent(n,name,p,top_id):
    global counter
    try:
        curr.execute("select id from rooms where name=%s", (name,))  # make sure the name is unique first
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
            pic_path = soup.find("a", {"id": "zh-avartar-edit-form"}).find("img").get("src")
            description = soup.find("div", {"class": "zm-editable-content"})
            if description is not None:
                description = description.text
            if u"未歸類" in title or u"根話題" in title:  # let these into the DB (to avoid an endless loop) but drop the description
                description = None
            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if (tag_path is not None) or tag_path == True:
                if tag_path == True:
                    tag_path = None
                father_id = 2  # default parent: the misc category
                curr.execute("select id from rooms where name=%s", (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute("select id from rooms where name=%s", (name,))  # make sure the name is unique first
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    temp = time.time()
                    x = time.localtime(float(temp))
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", x)  # current time
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    # this record qualifies to go into the database
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # commit right away, otherwise later lookups won't find the parent node
                    if counter % 200 == 0:
                        print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e
def work():
global queue curr.execute("select id,node,parent,name from classify where status=1") results = curr.fetchall() for r in results: top_id=r[0] node=r[1] parent=r[2] name=r[3] try: queue.put((node,name,parent)) #首先放入隊列 while queue.qsize() >0: n,p=queue.get() #頂節點出隊 getContent(n,p,top_id) getChildren(n,name) #出隊內容的子節點 conn.commit() except Exception as e: print "what"s wrong",e
def new_work():
global queue curr.execute("select id,data_id,name from classify_new_copy where status=1") results = curr.fetchall() for r in results: top_id=r[0] data_id=r[1] name=r[2] try: get_topis(data_id,name,top_id) except: pass
def get_topis(data_id,name,top_id):
    global queue
    url = "https://www.zhihu.com/node/TopicsPlazzaListV2"
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {"method": "next", "params": '{"topic_id":' + data_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode("utf-8")
                json_str = json.loads(html)
                ms = json_str["msg"]
                if len(ms) < 5:
                    break
                msg = ms[0]
            except Exception as e:
                print "eeeee", e
            # print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all("div", {"class": "blk"})
                for blk in blks:
                    page = blk.find("a").get("href")
                    if page is not None:
                        node = page.replace("/topic/", "")  # feed more seed topics into the database
                        parent = name
                        ne = blk.find("strong").text
                        try:
                            queue.put((node, ne, parent))  # seed the queue first
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # pop the head node
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the popped node's children
                            conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError as e:
            print "error is", e
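One more side note on get_topis: the params field is a JSON string assembled by hand-concatenation, which is fragile around the quoting. Building it with json.dumps is safer; a sketch, assuming the endpoint wants topic_id as a bare number, which is what the concatenated version produces:

    # Safer construction of the POST payload for TopicsPlazzaListV2.
    params = json.dumps({"topic_id": int(data_id), "offset": offset, "hash_id": ""})
    values = {"method": "next", "params": params}
    data = urllib.urlencode(values)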
if __name__ == "__main__":
    i = 0
    while i < 400:
        new_work()
        i = i + 1
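Note that the entry point only calls new_work(), which reads from classify_new_copy, while getTopics() fills classify_new; the post never shows the bridging step. Presumably you seed the tables once before the crawl. A guess at that bootstrap, under that assumption:

    # Assumed one-time bootstrap (not shown in the original post):
    # scrape the top-level topic list, then snapshot it for new_work() to consume.
    getTopics()
    curr.execute("INSERT INTO classify_new_copy(data_id,name) SELECT data_id,name FROM classify_new")
    conn.commit()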
The code is really quite simple; anyone with a bit of Python background can handle it, and the comments make it clear. Feel free to study and discuss it. My humble offering.
The article's copyright belongs to the author. Please do not reproduce it without permission; if this article violates any rules, you can contact an administrator to have it removed.
When reposting, please credit the original URL: http://specialneedsforspecialkids.com/yun/42793.html