Scrapy Proxy Configuration
Scrapy is the most powerful Python web crawling and scraping framework. With built-in support for concurrent requests, automatic throttling, data pipelines, and extensible middleware, Scrapy is the go-to choice for large-scale data extraction.
Scrapy has built-in proxy support through request.meta["proxy"], making it easy to route crawl traffic through ProxyMesh rotating proxies. This guide shows you how to configure proxies for reliable crawling without IP bans.
Why Use Proxies with Scrapy?
- Avoid IP bans - Rotate through different IPs automatically
- Bypass rate limits - Distribute requests across many addresses
- Geographic targeting - Access region-specific content
- Scale crawling - Crawl more pages without getting blocked
Installation
Install Scrapy using pip:
pip install scrapy
Scrapy has built-in proxy support—no additional packages needed to get started.
Basic Proxy Configuration
Set proxies per-request using request.meta["proxy"]:
import scrapy


class BasicProxySpider(scrapy.Spider):
    """Minimal spider that routes a single request through one proxy."""

    name = "basic_proxy"

    def start_requests(self):
        # Per-request proxying: Scrapy's built-in HttpProxyMiddleware
        # honors the "proxy" key in request meta.
        request_meta = {
            "proxy": "http://username:password@PROXYHOST:PORT"
        }
        yield scrapy.Request(
            url="https://api.ipify.org?format=json",
            meta=request_meta,
            callback=self.parse,
        )

    def parse(self, response):
        payload = response.json()
        ip = payload["ip"]
        self.logger.info(f"Request through IP: {ip}")
        yield {"ip": ip}
Run with: scrapy runspider spider.py
Proxy Middleware for All Requests
Apply proxy to all requests with middleware:
# middlewares.py
class ProxyMiddleware:
    """Downloader middleware that sends every request through one proxy."""

    # Single fixed proxy endpoint applied to all outgoing requests.
    PROXY_URL = "http://user:pass@PROXYHOST:PORT"

    def process_request(self, request, spider):
        # Returning None lets Scrapy continue processing the request.
        request.meta["proxy"] = self.PROXY_URL
# settings.py
# NOTE(review): priority 350 places this before Scrapy's built-in
# HttpProxyMiddleware in DOWNLOADER_MIDDLEWARES_BASE, so the "proxy"
# meta key is set ahead of it — confirm against your Scrapy version.
DOWNLOADER_MIDDLEWARES = {
"myproject.middlewares.ProxyMiddleware": 350,
}
Rotating Proxy Middleware
Rotate between multiple proxy locations:
# middlewares.py
import random


class RotatingProxyMiddleware:
    """Assigns a randomly chosen proxy from PROXIES to each request."""

    PROXIES = [
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
    ]

    def process_request(self, request, spider):
        # Pick a fresh proxy for every outgoing request.
        chosen = random.choice(self.PROXIES)
        request.meta["proxy"] = chosen
IP Authentication
With your IP whitelisted in ProxyMesh, no credentials needed:
yield scrapy.Request(
url="https://example.com",
meta={"proxy": "http://PROXYHOST:PORT"}
)
Custom Proxy Headers
For basic proxy usage, Scrapy works out of the box. For advanced control with custom proxy headers over HTTPS, use the scrapy-proxy-headers extension:
pip install scrapy-proxy-headers
# settings.py
# NOTE(review): scrapy-proxy-headers swaps in a download handler that can
# forward custom headers to the proxy on HTTPS requests — see that
# package's documentation for supported Scrapy versions.
DOWNLOAD_HANDLERS = {
"https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
}
# spider.py
import scrapy


class ProxyHeadersSpider(scrapy.Spider):
    """Sends a custom proxy header and reads the proxy's response header."""

    name = "proxy_headers"

    def start_requests(self):
        # "proxy_headers" is consumed by the scrapy-proxy-headers handler.
        request_meta = {
            "proxy": "http://user:pass@PROXYHOST:PORT",
            "proxy_headers": {"X-ProxyMesh-Country": "US"},
        }
        yield scrapy.Request(
            url="https://api.ipify.org?format=json",
            meta=request_meta,
            callback=self.parse,
        )

    def parse(self, response):
        # Access proxy response header (bytes key, per Scrapy's Headers API).
        routed_ip = response.headers.get(b"X-ProxyMesh-IP")
        if routed_ip:
            self.logger.info(f"Routed through: {routed_ip.decode()}")
See the scrapy-proxy-headers documentation for more details.
Common Use Cases
E-commerce Product Scraping
import scrapy


class ProductSpider(scrapy.Spider):
    """Scrapes product listings through a proxy, following pagination."""

    name = "products"
    start_urls = ["https://shop.example.com/products"]
    custom_settings = {
        "CONCURRENT_REQUESTS": 16,
        "DOWNLOAD_DELAY": 0.5,
    }

    def start_requests(self):
        for page_url in self.start_urls:
            yield scrapy.Request(
                page_url,
                meta={"proxy": "http://user:pass@PROXYHOST:PORT"},
            )

    def parse(self, response):
        for product in response.css("div.product"):
            link = product.css("a::attr(href)").get()
            yield {
                "name": product.css("h2::text").get(),
                "price": product.css("span.price::text").get(),
                "url": response.urljoin(link),
            }
        # Follow pagination — the proxy must be re-attached on every request.
        next_page = response.css("a.next::attr(href)").get()
        if next_page is not None:
            yield scrapy.Request(
                response.urljoin(next_page),
                meta={"proxy": "http://user:pass@PROXYHOST:PORT"},
                callback=self.parse,
            )
Geographic Content Variations
import scrapy


class GeoSpider(scrapy.Spider):
    """Fetches the same page through proxies in several locations."""

    name = "geo"

    # Location label -> proxy host for that region.
    LOCATIONS = {
        "us": "PROXYHOST:PORT",
        "uk": "PROXYHOST:PORT",
        "de": "PROXYHOST:PORT",
    }

    def start_requests(self):
        for loc, host in self.LOCATIONS.items():
            # Carry the location label in meta so the callback can tag items.
            request_meta = {
                "proxy": f"http://user:pass@{host}",
                "location": loc,
            }
            yield scrapy.Request(
                url="https://example.com/localized-content",
                meta=request_meta,
                callback=self.parse_location,
            )

    def parse_location(self, response):
        item = {
            "location": response.meta["location"],
            "content": response.css("div.content::text").get(),
        }
        yield item
Retry with Different Proxy on Failure
# middlewares.py
import random


class RetryProxyMiddleware:
    """Retries blocked responses and connection errors on a different proxy.

    Fixes over the naive version:
    - caps retries at ``MAX_RETRIES`` (tracked in ``proxy_retry_times`` meta)
      so a permanently blocked URL cannot retry forever;
    - marks retried requests ``dont_filter`` so the scheduler's duplicate
      filter does not silently drop them (same approach as Scrapy's own
      RetryMiddleware);
    - copies the request *before* mutating meta, leaving the original
      request object untouched.
    """

    PROXIES = [
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
    ]
    # Statuses that suggest the current proxy IP is blocked or throttled.
    RETRY_STATUSES = (403, 429, 503)
    MAX_RETRIES = 3

    def _retry_with_new_proxy(self, request):
        """Return a copy of *request* on a fresh proxy, or None if exhausted."""
        retries = request.meta.get("proxy_retry_times", 0)
        if retries >= self.MAX_RETRIES:
            return None
        retry_request = request.copy()
        retry_request.meta["proxy"] = random.choice(self.PROXIES)
        retry_request.meta["proxy_retry_times"] = retries + 1
        # The URL was already seen once — bypass the dupefilter for the retry.
        retry_request.dont_filter = True
        return retry_request

    def process_response(self, request, response, spider):
        if response.status in self.RETRY_STATUSES:
            # Retry with different proxy; fall through to the original
            # response once the retry budget is spent.
            retry_request = self._retry_with_new_proxy(request)
            if retry_request is not None:
                return retry_request
        return response

    def process_exception(self, request, exception, spider):
        # Retry with a different proxy on connection error. Returning None
        # (retries exhausted) hands the exception to other middleware.
        return self._retry_with_new_proxy(request)
Complete Production Spider
import scrapy


class ProductionSpider(scrapy.Spider):
    """Production spider: proxy on every request, retry settings, errback."""

    name = "production"
    custom_settings = {
        "CONCURRENT_REQUESTS": 8,
        "DOWNLOAD_DELAY": 1,
        "RETRY_TIMES": 3,
        "RETRY_HTTP_CODES": [500, 502, 503, 504, 408, 429],
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One proxy endpoint shared by every request this spider issues.
        self.proxy = "http://user:pass@PROXYHOST:PORT"

    def start_requests(self):
        for page in range(1, 101):
            yield scrapy.Request(
                f"https://example.com/page/{page}",
                meta={"proxy": self.proxy},
                callback=self.parse,
                errback=self.handle_error,
            )

    def parse(self, response):
        if response.status != 200:
            return
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
        }

    def handle_error(self, failure):
        failed_url = failure.request.url
        self.logger.error(f"Failed: {failed_url}")
ProxyMesh Headers Reference
Send these headers to control proxy behavior:
- X-ProxyMesh-Country — Route through a specific country (e.g., "US"). Only works with the world proxy or open proxy.
- X-ProxyMesh-IP — Request a specific outgoing IP address.
- X-ProxyMesh-Not-IP — Exclude specific IPs from rotation.
The proxy returns X-ProxyMesh-IP in the response with the IP address used.
Resources
- Scrapy Documentation
- scrapy-proxy-headers Documentation
- ProxyMesh Headers Reference
- Example Code on GitHub
Related Python Proxy Guides
Explore proxy configuration for other Python HTTP libraries:
- aiohttp - High-performance async HTTP client
- Requests - Simple Python HTTP for scripts
- CloudScraper - Bypass Cloudflare protection