基於 Django + Bootstrap 3 開發的簡易網站,網站搭建採用 nginx 1.8 + FastCGI(flup)+ Python。演示網站:http://www.itjujiao.com
分詞處理:目前分詞搜索的結果不是很理想,希望有大神可以指點一下思路。比如我檢索“功夫熊貓之卷軸的秘密”,一個結果都沒有;而檢索“功夫熊貓”有結果集(功丶夫熊貓⒊英語中英字幕.mp4,功丶夫熊貓2.Kung.Fu.Panda.2.2011.BDrip.720P.國粵英臺四語.特效中英字幕.mp4,功丶夫熊貓3(韓版)2016.高清中字.mkv等),搜索“卷軸的秘密”也有結果集([美國]功夫潘達之卷軸的秘密.2016.1080p.mp4,g夫熊貓之卷軸的秘密.HD1280超清中英雙字.mp4等)。
#coding: utf8 import re import urllib2 import time from Queue import Queue import threading, errno, datetime import json import requests import MySQLdb as mdb DB_HOST = "" DB_USER = "root" DB_PASS = "" re_start = re.compile(r"start=(d+)") re_uid = re.compile(r"query_uk=(d+)") re_pptt = re.compile(r"&pptt=(d+)") re_urlid = re.compile(r"&urlid=(d+)") ONEPAGE = 20 ONESHAREPAGE = 20 URL_SHARE = "http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&start={start}&limit=20&query_uk={uk}&urlid={id}" URL_FOLLOW = "http://yun.baidu.com/pcloud/friend/getfollowlist?query_uk={uk}&limit=20&start={start}&urlid={id}" URL_FANS = "http://yun.baidu.com/pcloud/friend/getfanslist?query_uk={uk}&limit=20&start={start}&urlid={id}" QNUM = 1000 hc_q = Queue(20) hc_r = Queue(QNUM) success = 0 failed = 0 PROXY_LIST = [[0, 10, "", 809, "", "", 0], [5, 0, "", 81, "", "", 0], ] def req_worker(inx): s = requests.Session() while True: req_item = hc_q.get() req_type = req_item[0] url = req_item[1] r = s.get(url) hc_r.put((r.text, url)) print "req_worker#", inx, url def response_worker(): dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "baiduyun", charset="utf8") dbcurr = dbconn.cursor() dbcurr.execute("SET NAMES utf8") dbcurr.execute("set global wait_timeout=60000") while True: metadata, effective_url = hc_r.get() #print "response_worker:", effective_url try: tnow = int(time.time()) id = re_urlid.findall(effective_url)[0] start = re_start.findall(effective_url)[0] if True: if "getfollowlist" in effective_url: #type = 1 follows = json.loads(metadata) uid = re_uid.findall(effective_url)[0] if "total_count" in follows.keys() and follows["total_count"]>0 and str(start) == "0": for i in range((follows["total_count"]-1)/ONEPAGE): try: dbcurr.execute("INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 1, 0)" % (uid, str(ONEPAGE*(i+1)), str(ONEPAGE))) except Exception as ex: print "E1", str(ex) pass if "follow_list" in follows.keys(): for item in follows["follow_list"]: try: 
dbcurr.execute("INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(%s, "%s", 0, 0, 0, %s)" % (item["follow_uk"], item["follow_uname"], str(tnow))) except Exception as ex: print "E13", str(ex) pass else: print "delete 1", uid, start dbcurr.execute("delete from urlids where uk=%s and type=1 and start>%s" % (uid, start)) elif "getfanslist" in effective_url: #type = 2 fans = json.loads(metadata) uid = re_uid.findall(effective_url)[0] if "total_count" in fans.keys() and fans["total_count"]>0 and str(start) == "0": for i in range((fans["total_count"]-1)/ONEPAGE): try: dbcurr.execute("INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 2, 0)" % (uid, str(ONEPAGE*(i+1)), str(ONEPAGE))) except Exception as ex: print "E2", str(ex) pass if "fans_list" in fans.keys(): for item in fans["fans_list"]: try: dbcurr.execute("INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(%s, "%s", 0, 0, 0, %s)" % (item["fans_uk"], item["fans_uname"], str(tnow))) except Exception as ex: print "E23", str(ex) pass else: print "delete 2", uid, start dbcurr.execute("delete from urlids where uk=%s and type=2 and start>%s" % (uid, start)) else: shares = json.loads(metadata) uid = re_uid.findall(effective_url)[0] if "total_count" in shares.keys() and shares["total_count"]>0 and str(start) == "0": for i in range((shares["total_count"]-1)/ONESHAREPAGE): try: dbcurr.execute("INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 0, 0)" % (uid, str(ONESHAREPAGE*(i+1)), str(ONESHAREPAGE))) except Exception as ex: print "E3", str(ex) pass if "records" in shares.keys(): for item in shares["records"]: try: dbcurr.execute("INSERT INTO share(userid, filename, shareid, status) VALUES(%s, "%s", %s, 0)" % (uid, item["title"], item["shareid"])) except Exception as ex: #print "E33", str(ex), item pass else: print "delete 0", uid, start dbcurr.execute("delete from urlids where uk=%s and type=0 and start>%s" % (uid, 
str(start))) dbcurr.execute("delete from urlids where id=%s" % (id, )) dbconn.commit() except Exception as ex: print "E5", str(ex), id pid = re_pptt.findall(effective_url) if pid: print "pid>>>", pid ppid = int(pid[0]) PROXY_LIST[ppid][6] -= 1 dbcurr.close() dbconn.close() def worker(): global success, failed dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "baiduyun", charset="utf8") dbcurr = dbconn.cursor() dbcurr.execute("SET NAMES utf8") dbcurr.execute("set global wait_timeout=60000") while True: #dbcurr.execute("select * from urlids where status=0 order by type limit 1") dbcurr.execute("select * from urlids where status=0 and type>0 limit 1") d = dbcurr.fetchall() #print d if d: id = d[0][0] uk = d[0][1] start = d[0][2] limit = d[0][3] type = d[0][4] dbcurr.execute("update urlids set status=1 where id=%s" % (str(id),)) url = "" if type == 0: url = URL_SHARE.format(uk=uk, start=start, id=id).encode("utf-8") elif type == 1: url = URL_FOLLOW.format(uk=uk, start=start, id=id).encode("utf-8") elif type == 2: url = URL_FANS.format(uk=uk, start=start, id=id).encode("utf-8") if url: hc_q.put((type, url)) #print "processed", url else: dbcurr.execute("select * from user where status=0 limit 1000") d = dbcurr.fetchall() if d: for item in d: try: dbcurr.execute("insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 0, 0)" % (item[1], str(ONESHAREPAGE))) dbcurr.execute("insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 1, 0)" % (item[1], str(ONEPAGE))) dbcurr.execute("insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 2, 0)" % (item[1], str(ONEPAGE))) dbcurr.execute("update user set status=1 where userid=%s" % (item[1],)) except Exception as ex: print "E6", str(ex) else: time.sleep(1) dbconn.commit() dbcurr.close() dbconn.close() for item in range(16): t = threading.Thread(target = req_worker, args = (item,)) t.setDaemon(True) t.start() s = threading.Thread(target = worker, args = ()) s.setDaemon(True) 
s.start() response_worker()
摘要:運營商網絡大致可劃分為四朵云:公有云、平臺云、IT云、網絡云。網絡即云,云網一體化將成為未來運營商網絡的最顯著特征。 5月25日消息,“互聯網+”是要讓信息技術、網絡技術深度融合于經濟社會各領域之中,使互聯網下沉為各行各業都能調用的基礎設施資源。預計到2025年,全球將有65億互聯網用戶,使用80億個智能手機,創建1000億個連接,產生176ZB的數據流量,全面實現泛在的連接。在未來,網絡需要滿足海量終端的接...
摘要:今天開源了一個百度云網盤爬蟲項目,地址是 https://github.com/callmelanmao/yunshare。推薦使用命令安裝依賴,最簡單的安裝方式更多安裝的命令可以去上面找。啟動項目使用進行進程管理,運行啟動所有的後臺任務,檢查任務是否正常運行可以用命令,正常運行的應該有個任務。 今天開源了一個百度云網盤爬蟲項目,地址是 https://github.com/callmelanmao/yunshare。 百度云分享爬蟲項目 github上有好幾個這樣的...
摘要:2013年初,金山啟動私有云項目,該項目旨在為向金山提出了私有云網盤/存儲需求的政府、大型企業以及中型企業提供服務,項目組由金山云CTO楊鋼牽頭組建。InfoQ中文站對楊鋼進行了專訪,了解其私有云服務的技術組成和業務狀態。 2013年初,金山啟動私有云項目,該項目旨在為向金山提出了私有云網盤/存儲需求的政府、大型企業以及中型企業提供服務,項目組由金山云CTO楊鋼牽頭組建。InfoQ中文站對楊鋼進行了專訪,了解其私有云服...
閱讀 2296·2021-10-09 09:41
閱讀 1750·2019-08-30 15:53
閱讀 992·2019-08-30 15:52
閱讀 3448·2019-08-30 11:26
閱讀 773·2019-08-29 16:09
閱讀 3429·2019-08-29 13:25
閱讀 2264·2019-08-26 16:45
閱讀 1937·2019-08-26 11:51