Scrapy Proxy Configuration

← Back to Python Libraries

Scrapy is the most powerful Python web crawling and scraping framework. With built-in support for concurrent requests, automatic throttling, data pipelines, and extensible middleware, Scrapy is the go-to choice for large-scale data extraction.

Scrapy has built-in proxy support through request.meta["proxy"], making it easy to route crawl traffic through ProxyMesh rotating proxies. This guide shows you how to configure proxies for reliable crawling without IP bans.

Why Use Proxies with Scrapy?

  • Avoid IP bans - Rotate through different IPs automatically
  • Bypass rate limits - Distribute requests across many addresses
  • Geographic targeting - Access region-specific content
  • Scale crawling - Crawl more pages without getting blocked

Installation

Install Scrapy using pip:

pip install scrapy

Scrapy has built-in proxy support—no additional packages needed to get started.

Basic Proxy Configuration

Set proxies per-request using request.meta["proxy"]:

import scrapy

class BasicProxySpider(scrapy.Spider):
    """Minimal spider showing per-request proxy configuration.

    Sends one request through the proxy given in ``request.meta["proxy"]``
    and logs/yields the outgoing IP address reported by api.ipify.org.
    """

    name = "basic_proxy"
    
    def start_requests(self):
        # meta["proxy"] is honored by Scrapy's built-in HttpProxyMiddleware;
        # no extra packages or settings are required.
        yield scrapy.Request(
            url="https://api.ipify.org?format=json",
            meta={
                "proxy": "http://username:password@PROXYHOST:PORT"
            },
            callback=self.parse
        )
    
    def parse(self, response):
        # api.ipify.org returns {"ip": "<caller ip>"}; with the proxy in
        # place this is the proxy's outgoing IP, not the local machine's.
        data = response.json()
        self.logger.info(f"Request through IP: {data['ip']}")
        yield {"ip": data["ip"]}

Run with: scrapy runspider spider.py

Proxy Middleware for All Requests

Apply a proxy to all requests with a downloader middleware:

# middlewares.py
class ProxyMiddleware:
    """Downloader middleware that routes every outgoing request through a
    single ProxyMesh endpoint."""

    # Proxy URL applied to each request.
    PROXY_URL = "http://user:pass@PROXYHOST:PORT"

    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware picks this meta value up.
        request.meta["proxy"] = self.PROXY_URL
# settings.py
DOWNLOADER_MIDDLEWARES = {
    # Priority 350 runs this before Scrapy's built-in HttpProxyMiddleware
    # (default priority 750), which consumes the meta["proxy"] value it sets.
    "myproject.middlewares.ProxyMiddleware": 350,
}

Rotating Proxy Middleware

Rotate between multiple proxy locations:

# middlewares.py
import random

class RotatingProxyMiddleware:
    """Downloader middleware that assigns a randomly chosen proxy from
    PROXIES to every outgoing request."""

    PROXIES = [
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
    ]

    def process_request(self, request, spider):
        # A fresh random pick per request spreads traffic across endpoints.
        chosen = random.choice(self.PROXIES)
        request.meta["proxy"] = chosen

IP Authentication

With your IP address whitelisted in ProxyMesh, no credentials are needed:

yield scrapy.Request(
    url="https://example.com",
    meta={"proxy": "http://PROXYHOST:PORT"}
)

Custom Proxy Headers

For basic proxy usage, Scrapy works out of the box. For advanced control with custom proxy headers over HTTPS, use the scrapy-proxy-headers extension:

pip install scrapy-proxy-headers
# settings.py
DOWNLOAD_HANDLERS = {
    # Replaces the default HTTPS download handler so custom proxy headers
    # can be forwarded to the proxy on HTTPS requests (scrapy-proxy-headers).
    "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
}
# spider.py
import scrapy

class ProxyHeadersSpider(scrapy.Spider):
    """Demonstrates sending and reading ProxyMesh control headers.

    Requires the scrapy-proxy-headers download handler (configured in
    settings.py) so ``meta["proxy_headers"]`` reaches the proxy over HTTPS.
    """

    name = "proxy_headers"
    
    def start_requests(self):
        yield scrapy.Request(
            url="https://api.ipify.org?format=json",
            meta={
                "proxy": "http://user:pass@PROXYHOST:PORT",
                # Ask ProxyMesh to route this request through a US exit IP.
                "proxy_headers": {"X-ProxyMesh-Country": "US"}
            },
            callback=self.parse
        )
    
    def parse(self, response):
        # Access proxy response header
        # Header values are bytes in Scrapy; decode before logging.
        proxy_ip = response.headers.get(b"X-ProxyMesh-IP")
        if proxy_ip:
            self.logger.info(f"Routed through: {proxy_ip.decode()}")

See the scrapy-proxy-headers documentation for more details.

Common Use Cases

E-commerce Product Scraping

import scrapy

class ProductSpider(scrapy.Spider):
    """Crawls a product listing through one proxy, following pagination.

    Yields one item per product with ``name``, ``price`` and an absolute
    ``url`` (or None when the product has no link).
    """

    name = "products"
    start_urls = ["https://shop.example.com/products"]

    # Single proxy endpoint, reused for listing pages and pagination so it
    # only needs to be changed in one place.
    PROXY = "http://user:pass@PROXYHOST:PORT"

    custom_settings = {
        "CONCURRENT_REQUESTS": 16,
        "DOWNLOAD_DELAY": 0.5,
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                meta={"proxy": self.PROXY}
            )

    def parse(self, response):
        for product in response.css("div.product"):
            href = product.css("a::attr(href)").get()
            yield {
                "name": product.css("h2::text").get(),
                "price": product.css("span.price::text").get(),
                # .get() returns None when no <a> matches, and
                # response.urljoin(None) raises TypeError — guard it.
                "url": response.urljoin(href) if href else None,
            }

        # Follow pagination
        next_page = response.css("a.next::attr(href)").get()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                meta={"proxy": self.PROXY},
                callback=self.parse
            )

Geographic Content Variations

import scrapy

class GeoSpider(scrapy.Spider):
    """Fetches the same page through proxies in several regions.

    Yields one item per location so regional content variations can be
    compared side by side.
    """

    name = "geo"
    
    # Location label -> proxy host:port serving that region.
    LOCATIONS = {
        "us": "PROXYHOST:PORT",
        "uk": "PROXYHOST:PORT",
        "de": "PROXYHOST:PORT",
    }
    
    def start_requests(self):
        for location, proxy_host in self.LOCATIONS.items():
            yield scrapy.Request(
                url="https://example.com/localized-content",
                meta={
                    "proxy": f"http://user:pass@{proxy_host}",
                    # Custom meta key: carried through to the callback so each
                    # item records which proxy location produced it.
                    "location": location
                },
                callback=self.parse_location
            )
    
    def parse_location(self, response):
        yield {
            "location": response.meta["location"],
            "content": response.css("div.content::text").get()
        }

Retry with Different Proxy on Failure

# middlewares.py
import random

class RetryProxyMiddleware:
    """Retries blocked or failed requests through a different proxy.

    On a 403/429/503 response, or on a download exception, the request is
    re-issued with a new randomly chosen proxy.  Two fixes over the naive
    version: retries are capped via a meta counter so a permanently blocked
    URL cannot loop forever, and the retry copy sets ``dont_filter=True``
    so Scrapy's duplicate filter does not silently drop it.
    """

    PROXIES = [
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
    ]

    # Maximum proxy-swap retries per request.
    MAX_PROXY_RETRIES = 3

    def process_response(self, request, response, spider):
        # 403/429/503 suggest the current proxy IP is blocked or throttled.
        if response.status in [403, 429, 503]:
            retry_req = self._retry_with_new_proxy(request)
            if retry_req is not None:
                return retry_req
        return response

    def process_exception(self, request, exception, spider):
        # Connection-level failure: swap proxy and retry.  Returning None
        # (retries exhausted) lets default exception handling take over.
        return self._retry_with_new_proxy(request)

    def _retry_with_new_proxy(self, request):
        """Return a retry copy with a fresh proxy, or None if retries are spent."""
        retries = request.meta.get("proxy_retry_times", 0)
        if retries >= self.MAX_PROXY_RETRIES:
            return None
        retry_req = request.copy()  # copy first so the original is not mutated
        retry_req.meta["proxy"] = random.choice(self.PROXIES)
        retry_req.meta["proxy_retry_times"] = retries + 1
        retry_req.dont_filter = True  # bypass the dupe filter for the retry
        return retry_req

Complete Production Spider

import scrapy

class ProductionSpider(scrapy.Spider):
    """Production-style spider: proxy on every request, retry settings,
    and an errback for connection-level failures."""

    name = "production"
    
    custom_settings = {
        "CONCURRENT_REQUESTS": 8,
        "DOWNLOAD_DELAY": 1,
        "RETRY_TIMES": 3,
        # Retry transient 5xx errors plus 408 (timeout) and 429 (rate limit).
        "RETRY_HTTP_CODES": [500, 502, 503, 504, 408, 429],
    }
    
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One proxy endpoint for the whole crawl, attached per request below.
        self.proxy = "http://user:pass@PROXYHOST:PORT"
    
    def start_requests(self):
        urls = [f"https://example.com/page/{i}" for i in range(1, 101)]
        for url in urls:
            yield scrapy.Request(
                url,
                meta={"proxy": self.proxy},
                callback=self.parse,
                # errback fires on network-level failures (DNS, timeout),
                # not on HTTP error statuses.
                errback=self.handle_error
            )
    
    def parse(self, response):
        # NOTE(review): non-200 responses normally don't reach the callback
        # unless handle_httpstatus_list is set, so this check is a safeguard.
        if response.status == 200:
            yield {
                "url": response.url,
                "title": response.css("title::text").get(),
            }
    
    def handle_error(self, failure):
        # failure is a Twisted Failure; the originating request is attached.
        self.logger.error(f"Failed: {failure.request.url}")

ProxyMesh Headers Reference

Send these headers to control proxy behavior:

  • X-ProxyMesh-Country - Route through a specific country (e.g., "US"). Only works with world proxy or open proxy
  • X-ProxyMesh-IP - Request a specific outgoing IP address
  • X-ProxyMesh-Not-IP - Exclude specific IPs from rotation

The proxy returns X-ProxyMesh-IP in the response with the IP address used.

Resources

Related Python Proxy Guides

Explore proxy configuration for other Python HTTP libraries:

Start Free Trial