Python语言测试代码_爬虫（动态）代理测试案例

Python 3.X 爬虫（动态）代理IP | Python 2.X 爬虫（动态）代理IP | Python 3.X Socks5代理IP | Python 3.X 动态转发代理IP | Scrapy

Python 3.X 代码调用示例·爬虫（动态）代理IP 下载DEMO项目


# -*- coding: UTF-8 -*-

'''
Python 3.x
无忧代理IP Created on 2018年05月11日
描述：本DEMO演示了使用爬虫（动态）代理IP请求网页的过程，代码使用了多线程
逻辑：每隔5秒从API接口获取IP，对于每一个IP开启一个线程去抓取网页源码
@author: www.data5u.com
'''
import requests;
import time;
import threading;
import urllib3;

# Proxy entries from the most recent API response; rebound by GetIpThread.run().
ips = [];

# 爬数据的线程类
class CrawlThread(threading.Thread):
    """Worker thread that downloads ``targetUrl`` through one proxy.

    The target address is read from the module-level ``targetUrl`` global
    when :meth:`run` executes.
    """

    def __init__(self, proxyip):
        """Remember the proxy to use.

        :param proxyip: proxy address; it is appended verbatim to
            ``http://`` / ``https://`` to build the proxy URLs.
        """
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        """Fetch ``targetUrl`` via the proxy and print the timing and body.

        Errors raised by the request or by decoding the body are not caught
        here, so they terminate this thread only.
        """
        start = time.time()

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Connection': 'close'
        }

        # verify=False (below) skips certificate validation; silence the
        # warning urllib3 would otherwise print for it.
        urllib3.disable_warnings()

        # NOTE(review): an "https://" proxy URL makes recent urllib3 releases
        # open a TLS connection to the proxy itself. If these are plain HTTP
        # proxies the "https" entry should probably use "http://" - confirm.
        proxies = {
            "http": 'http://' + self.proxyip,
            "https": 'https://' + self.proxyip,
        }
        response = requests.get(url=targetUrl, proxies=proxies, verify=False, timeout=15, headers=headers)
        html = response.content.decode()

        # time.time() counts seconds, but the message labels the figure as
        # milliseconds, so convert before printing.
        elapsed_ms = int((time.time() - start) * 1000)

        print(threading.current_thread().name + "使用代理IP, 耗时 " + str(elapsed_ms) + "毫秒 " + self.proxyip + " 获取到如下HTML内容：\n" + html + "\n*************")

# 获取代理IP的线程类
class GetIpThread(threading.Thread):
    """Thread that polls the proxy API forever and spawns one crawler per proxy.

    Reads the module-level ``apiUrl`` global and rebinds the module-level
    ``ips`` global on every poll.
    """

    def __init__(self, fetchSecond):
        """Remember the polling interval.

        :param fetchSecond: seconds to sleep between two API requests.
        """
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        """Fetch the proxy list, start a ``CrawlThread`` per entry, sleep, repeat.

        Never returns. An error raised by the API request is not caught and
        ends this thread.
        """
        global ips
        while True:
            # The API answers with one proxy per line.
            res = requests.get(apiUrl).content.decode()
            # Strip every entry before it is used: a trailing "\r" or space
            # would otherwise end up inside the proxy URL. Blank lines are
            # dropped.
            ips = [line.strip() for line in res.split('\n') if line.strip()]
            for proxyip in ips:
                print(proxyip)
                CrawlThread(proxyip).start()
            time.sleep(self.fetchSecond)

if __name__ == '__main__':
    # Page that every crawl thread downloads.
    targetUrl = "http://pv.sohu.com/cityjson?ie=utf-8"
    # Seconds to wait between two requests to the proxy API.
    fetchSecond = 5
    # Order code issued by the proxy provider; replace the placeholder text.
    order = "请把这里替换为您的IP提取码"
    # API endpoint that returns the proxies for this order.
    apiUrl = "http://api.ip.data5u.com/dynamic/get.html?order=" + order
    # Start polling the API for proxies.
    GetIpThread(fetchSecond).start()

Python 2.X 代码调用示例·爬虫（动态）代理IP


# -*- coding: utf8 -*-

'''
Python 2.X
无忧代理IP Created on 2017年08月21日
描述：本DEMO演示了使用爬虫（动态）代理IP请求网页的过程，代码使用了多线程
逻辑：每隔5秒从API接口获取IP，对于每一个IP开启一个线程去抓取网页源码
@author: www.data5u.com
'''
import time
import threading
import requests
import sys

# 解决编码报错问题
# Python 2 only: `reload` is a builtin there and does not exist as a builtin
# in Python 3. The two calls below make 'utf8' the default string encoding.
# NOTE(review): reload(sys) is presumably the usual Python 2 workaround to make
# sys.setdefaultencoding reachable again after interpreter start-up - confirm.
reload(sys)
sys.setdefaultencoding('utf8')

# Proxy entries from the most recent API response; rebound by GetIpThread.run().
ips = [];

# 爬数据的线程类
class CrawlThread(threading.Thread):
    """Worker thread that downloads ``targetUrl`` through one proxy.

    The target address is read from the module-level ``targetUrl`` global
    when :meth:`run` executes.
    """

    def __init__(self, proxyip):
        """Remember the proxy to use.

        :param proxyip: proxy address; it is appended verbatim to
            ``http://`` / ``https://`` to build the proxy URLs.
        """
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        """Fetch ``targetUrl`` via the proxy and print the timing and body.

        Errors raised by the request or by decoding the body are not caught
        here, so they terminate this thread only.
        """
        start = time.time()

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Connection': 'close'
        }

        # verify=False skips certificate validation for HTTPS targets.
        # NOTE(review): an "https://" proxy URL may make the HTTP library open
        # a TLS connection to the proxy itself, depending on its version. If
        # these are plain HTTP proxies the "https" entry should probably use
        # "http://" - confirm.
        proxies = {
            "http": "http://" + self.proxyip,
            "https": "https://" + self.proxyip,
        }
        response = requests.get(
            url=targetUrl,
            proxies=proxies,
            verify=False,
            timeout=15,
            headers=headers
        )
        html = response.content.decode()

        # time.time() counts seconds, but the message labels the figure as
        # milliseconds, so convert before printing.
        elapsed_ms = int((time.time() - start) * 1000)

        print(threading.current_thread().getName() + "耗时 " + str(elapsed_ms) + "毫秒 " + self.proxyip + " 获取到如下HTML内容：\n" + html + "\n*************")

# 获取代理IP的线程类
class GetIpThread(threading.Thread):
    """Thread that polls the proxy API forever and spawns one crawler per proxy.

    Reads the module-level ``apiUrl`` global and rebinds the module-level
    ``ips`` global on every poll.
    """

    def __init__(self, fetchSecond):
        """Remember the polling interval.

        :param fetchSecond: seconds to sleep between two API requests.
        """
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        """Fetch the proxy list, start a ``CrawlThread`` per entry, sleep, repeat.

        Never returns. An error raised by the API request is not caught and
        ends this thread.
        """
        global ips
        while True:
            # The previous code called urllib.urlopen, but this demo never
            # imports urllib, so the first poll raised NameError. requests is
            # already imported and is what the crawl threads use.
            res = requests.get(apiUrl).content.strip("\n")
            # Strip every entry before it is used: a trailing "\r" or space
            # would otherwise end up inside the proxy URL. Blank lines are
            # dropped.
            ips = [line.strip() for line in res.split("\n") if line.strip()]
            for proxyip in ips:
                CrawlThread(proxyip).start()
            time.sleep(self.fetchSecond)

if __name__ == '__main__':
    # Page that every crawl thread downloads.
    targetUrl = "http://pv.sohu.com/cityjson?ie=utf-8"
    # Seconds to wait between two requests to the proxy API.
    fetchSecond = 5
    # Order code issued by the proxy provider; replace the placeholder text.
    order = "please-input-your-order-here"
    # API endpoint that returns the proxies for this order.
    apiUrl = "http://api.ip.data5u.com/dynamic/get.html?order=" + order
    # Start polling the API for proxies.
    GetIpThread(fetchSecond).start()

Python 3.X 代码调用示例·Socks5代理IP


# -*- coding: UTF-8 -*-

'''
Python 3.x：参考 https://www.programcreek.com/python/example/71719/socks.SOCKS5
无忧代理IP Created on 2018年05月11日
描述：本DEMO演示了使用Socks5代理IP请求网页的过程，代码使用了多线程
逻辑：每隔固定时间（由fetchSecond指定，本例为21秒）从API接口获取一个Socks5代理，并开启一个线程去抓取网页源码
注意：需先安装socks模块pip install pysocks
linux测试命令：curl myip.ipip.net --socks5 data5u:s9BBv.@61.186.64.96:49003
@author: www.data5u.com
'''
import requests;
import time;
import threading;
import socks;
from sockshandler import SocksiPyHandler;
from urllib.request import build_opener;
import socket;

# 爬数据的线程类
class CrawlThread(threading.Thread):
    """Worker thread that downloads ``targetUrl`` through one SOCKS5 proxy.

    The target address is read from the module-level ``targetUrl`` global
    when :meth:`run` executes.
    """

    def __init__(self, proxyip):
        """Remember the proxy description.

        :param proxyip: comma-separated string that :meth:`run` splits into
            ``host:port``, username and password (in that order).
        """
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        """Fetch ``targetUrl`` via the SOCKS5 proxy and print timing and body.

        :raises IndexError: if ``proxyip`` does not have three comma-separated
            fields or the first field lacks a ``:port`` part.
        :raises ValueError: if the port is not an integer.

        Errors raised while opening or reading the URL are not caught either.
        """
        start = time.time()

        # Split "host:port,username,password" once instead of re-splitting
        # the first field for every use.
        ipinfos = self.proxyip.split(',')
        username = ipinfos[1]
        password = ipinfos[2]
        hostport = ipinfos[0].split(':')
        host = hostport[0]
        port = int(hostport[1])

        # Debug output of the parsed proxy settings.
        # NOTE(review): this prints the proxy password in clear text.
        print(username)
        print(password)
        print(host)
        print(port)

        # Alternative kept from the original as a hint: route every socket of
        # the process through the proxy instead of using a dedicated opener:
        #   socks.set_default_proxy(proxy_type=socks.SOCKS5, addr=host, port=port,
        #                           rdns=True, username=username, password=password)
        #   socket.socket = socks.socksocket

        headers = {
            'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
            # The key used to be spelled 'Accept - Encoding' (with spaces),
            # which is not a valid header name. 'identity' is requested rather
            # than gzip/deflate because the body is decoded straight from
            # resp.read() below, with no decompression step.
            'Accept-Encoding': 'identity',
            'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }

        proxy_handler = SocksiPyHandler(proxytype=socks.SOCKS5, proxyaddr=host, proxyport=port, username=username, password=password)
        opener = build_opener(proxy_handler)
        opener.addheaders = list(headers.items())
        resp = opener.open(targetUrl, timeout=30)

        html = resp.read().decode()

        # time.time() counts seconds, but the message labels the figure as
        # milliseconds, so convert before printing.
        elapsed_ms = int((time.time() - start) * 1000)

        print(threading.current_thread().name + "使用代理IP, 耗时 " + str(elapsed_ms) + "毫秒 " + self.proxyip + " 获取到如下HTML内容：\n" + html + "\n*************")

# 获取代理IP的线程类
class GetIpThread(threading.Thread):
    """Thread that repeatedly asks the API for a proxy and starts a crawler for it.

    Reads the module-level ``apiUrl`` global.
    """

    def __init__(self, fetchSecond):
        """Remember the polling interval.

        :param fetchSecond: seconds to sleep between two API requests.
        """
        threading.Thread.__init__(self)
        self.fetchSecond = fetchSecond

    def run(self):
        """Loop forever: fetch one API response, crawl with it, then sleep."""
        while True:
            # The whole (whitespace-trimmed) response body is handed to a
            # single CrawlThread as its proxy description.
            response = requests.get(apiUrl)
            proxy_entry = response.content.decode().strip()

            worker = CrawlThread(proxy_entry)
            worker.start()

            time.sleep(self.fetchSecond)

if __name__ == '__main__':
    # Page that every crawl thread downloads.
    targetUrl = "http://pv.sohu.com/cityjson?ie=utf-8"
    # Seconds to wait between two requests to the proxy API.
    fetchSecond = 21
    # Order code issued by the proxy provider; replace the placeholder text.
    order = "这里改成你的IP提取码"
    # API endpoint; a SOCKS5 proxy comes back as "IP:port,username,password".
    apiUrl = "http://api.ip.data5u.com/socks/get.html?type=0&sep=3&order=" + order
    # Start polling the API for proxies.
    GetIpThread(fetchSecond).start()

Python 3.X 代码调用示例·动态转发代理IP


# -*- coding: UTF-8 -*-

'''
Python 3.x
无忧代理IP Created on 2018年05月11日
描述：本DEMO演示了使用【动态转发】代理请求网页的过程，代码使用了多线程
@author: www.data5u.com
'''
import requests;
import time;
import threading;
import urllib3;

# Not read or written by any other code visible in this demo.
ips = [];

# 爬数据的线程类
class CrawlThread(threading.Thread):
    """Worker thread that downloads ``targetUrl`` through the given proxy.

    The target address is read from the module-level ``targetUrl`` global
    when :meth:`run` executes.
    """

    def __init__(self, proxyip):
        """Remember the proxy to use.

        :param proxyip: proxy address (optionally prefixed with
            ``user:password@``); it is appended verbatim to ``http://`` /
            ``https://`` to build the proxy URLs.
        """
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        """Fetch ``targetUrl`` via the proxy and print the timing and body.

        Errors raised by the request or by decoding the body are not caught
        here, so they terminate this thread only.
        """
        start = time.time()

        # verify=False (below) skips certificate validation; silence the
        # warning urllib3 would otherwise print for it.
        urllib3.disable_warnings()

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Connection': 'close'
        }

        # NOTE(review): an "https://" proxy URL makes recent urllib3 releases
        # open a TLS connection to the proxy itself. If the tunnel is a plain
        # HTTP proxy the "https" entry should probably use "http://" - confirm.
        proxies = {
            "http": "http://" + self.proxyip,
            "https": "https://" + self.proxyip,
        }
        response = requests.get(
            url=targetUrl,
            proxies=proxies,
            verify=False,
            timeout=15,
            headers=headers
        )
        html = response.content.decode()

        # time.time() counts seconds, but the message labels the figure as
        # milliseconds, so convert before printing.
        elapsed_ms = int((time.time() - start) * 1000)

        print(threading.current_thread().name + "耗时 " + str(elapsed_ms) + "毫秒 " + self.proxyip + " 获取到如下HTML内容：\n" + html + "\n*************")

if __name__ == '__main__':

    # Page that every crawl thread downloads.
    targetUrl = "http://myip.ipip.net/"  # alternative: https://pv.sohu.com/cityjson?ie=utf-8

    # Number of crawl threads to start.
    threadNum = 5

    # Forwarding-tunnel endpoint and its credentials (replace both placeholders).
    proxyIp = "tunnel.data5u.com:56789"
    proxyUsername = "【这里替换成你的IP提取码】"
    proxyPwd = "【这里替换成你的动态转发密码】"

    for reqNo in range(threadNum):
        # Every thread gets the same "username:password@host:port" proxy string.
        CrawlThread(proxyUsername + ":" + proxyPwd + "@" + proxyIp).start()

新手上路

支付方式

常见问题

关于我们