Scrapy Proxy Configuration
Scrapy is the most powerful Python web crawling and scraping framework. With built-in support for concurrent requests, automatic throttling, data pipelines, and extensible middleware, Scrapy is the go-to choice for large-scale data extraction.
Scrapy has built-in proxy support through request.meta["proxy"], making it easy to route crawl traffic through ProxyMesh rotating proxies. This guide shows you how to configure proxies for reliable crawling without IP bans.
Why Use Proxies with Scrapy?
- Avoid IP bans - Rotate through different IPs automatically
- Bypass rate limits - Distribute requests across many addresses
- Geographic targeting - Access region-specific content
- Scale crawling - Crawl more pages without getting blocked
Installation
Install Scrapy using pip:
pip install scrapy
Scrapy has built-in proxy support—no additional packages needed to get started.
Basic Proxy Configuration
Set proxies per-request using request.meta["proxy"]:
import scrapy


class BasicProxySpider(scrapy.Spider):
    """Minimal spider that routes a single request through one proxy."""

    name = "basic_proxy"

    def start_requests(self):
        # Per-request proxying: Scrapy's built-in HttpProxyMiddleware
        # honors the "proxy" key in request meta.
        request_meta = {
            "proxy": "http://username:password@PROXYHOST:PORT"
        }
        yield scrapy.Request(
            url="https://api.ipify.org?format=json",
            meta=request_meta,
            callback=self.parse,
        )

    def parse(self, response):
        payload = response.json()
        ip = payload["ip"]
        self.logger.info(f"Request through IP: {ip}")
        yield {"ip": ip}
Run with: scrapy runspider spider.py
Proxy Middleware for All Requests
Apply proxy to all requests with middleware:
# middlewares.py
class ProxyMiddleware:
    """Downloader middleware that sends every request through one proxy."""

    # Single fixed proxy endpoint applied to all outgoing requests.
    PROXY_URL = "http://user:pass@PROXYHOST:PORT"

    def process_request(self, request, spider):
        # Returning None lets Scrapy continue processing the request.
        request.meta["proxy"] = self.PROXY_URL
# settings.py
# NOTE(review): priority 350 places this before Scrapy's built-in
# HttpProxyMiddleware in DOWNLOADER_MIDDLEWARES_BASE, so the "proxy"
# meta key is set ahead of it — confirm against your Scrapy version.
DOWNLOADER_MIDDLEWARES = {
"myproject.middlewares.ProxyMiddleware": 350,
}
Rotating Proxy Middleware
Rotate between multiple proxy locations:
# middlewares.py
import random


class RotatingProxyMiddleware:
    """Assigns a randomly chosen proxy from PROXIES to each request."""

    PROXIES = [
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
    ]

    def process_request(self, request, spider):
        # Pick a fresh proxy for every outgoing request.
        chosen = random.choice(self.PROXIES)
        request.meta["proxy"] = chosen
IP Authentication
With your IP whitelisted in ProxyMesh, no credentials needed:
yield scrapy.Request(
url="https://example.com",
meta={"proxy": "http://PROXYHOST:PORT"}
)
Custom Proxy Headers
For basic proxy usage, Scrapy works out of the box. For advanced control with custom proxy headers over HTTPS, use the scrapy-proxy-headers extension:
pip install scrapy-proxy-headers
# settings.py
# NOTE(review): scrapy-proxy-headers swaps in a download handler that can
# forward custom headers to the proxy on HTTPS requests — see that
# package's documentation for supported Scrapy versions.
DOWNLOAD_HANDLERS = {
"https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
}
# spider.py
import scrapy


class ProxyHeadersSpider(scrapy.Spider):
    """Sends a custom proxy header and reads the proxy's response header."""

    name = "proxy_headers"

    def start_requests(self):
        # "proxy_headers" is consumed by the scrapy-proxy-headers handler.
        request_meta = {
            "proxy": "http://user:pass@PROXYHOST:PORT",
            "proxy_headers": {"X-ProxyMesh-Country": "US"},
        }
        yield scrapy.Request(
            url="https://api.ipify.org?format=json",
            meta=request_meta,
            callback=self.parse,
        )

    def parse(self, response):
        # Access proxy response header (bytes key, per Scrapy's Headers API).
        routed_ip = response.headers.get(b"X-ProxyMesh-IP")
        if routed_ip:
            self.logger.info(f"Routed through: {routed_ip.decode()}")
See the scrapy-proxy-headers documentation for more details.
Common Use Cases
E-commerce Product Scraping
import scrapy


class ProductSpider(scrapy.Spider):
    """Scrapes product listings through a proxy, following pagination."""

    name = "products"
    start_urls = ["https://shop.example.com/products"]
    custom_settings = {
        "CONCURRENT_REQUESTS": 16,
        "DOWNLOAD_DELAY": 0.5,
    }

    def start_requests(self):
        for page_url in self.start_urls:
            yield scrapy.Request(
                page_url,
                meta={"proxy": "http://user:pass@PROXYHOST:PORT"},
            )

    def parse(self, response):
        for product in response.css("div.product"):
            link = product.css("a::attr(href)").get()
            yield {
                "name": product.css("h2::text").get(),
                "price": product.css("span.price::text").get(),
                "url": response.urljoin(link),
            }
        # Follow pagination — the proxy must be re-attached on every request.
        next_page = response.css("a.next::attr(href)").get()
        if next_page is not None:
            yield scrapy.Request(
                response.urljoin(next_page),
                meta={"proxy": "http://user:pass@PROXYHOST:PORT"},
                callback=self.parse,
            )
Geographic Content Variations
import scrapy


class GeoSpider(scrapy.Spider):
    """Fetches the same page through proxies in several locations."""

    name = "geo"

    # Location label -> proxy host for that region.
    LOCATIONS = {
        "us": "PROXYHOST:PORT",
        "uk": "PROXYHOST:PORT",
        "de": "PROXYHOST:PORT",
    }

    def start_requests(self):
        for loc, host in self.LOCATIONS.items():
            # Carry the location label in meta so the callback can tag items.
            request_meta = {
                "proxy": f"http://user:pass@{host}",
                "location": loc,
            }
            yield scrapy.Request(
                url="https://example.com/localized-content",
                meta=request_meta,
                callback=self.parse_location,
            )

    def parse_location(self, response):
        item = {
            "location": response.meta["location"],
            "content": response.css("div.content::text").get(),
        }
        yield item
Retry with Different Proxy on Failure
# middlewares.py
import random


class RetryProxyMiddleware:
    """Retries blocked responses and connection errors on a different proxy.

    Fixes over the naive version:
    - caps retries at ``MAX_RETRIES`` (tracked in ``proxy_retry_times`` meta)
      so a permanently blocked URL cannot retry forever;
    - marks retried requests ``dont_filter`` so the scheduler's duplicate
      filter does not silently drop them (same approach as Scrapy's own
      RetryMiddleware);
    - copies the request *before* mutating meta, leaving the original
      request object untouched.
    """

    PROXIES = [
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
        "http://user:pass@PROXYHOST:PORT",
    ]
    # Statuses that suggest the current proxy IP is blocked or throttled.
    RETRY_STATUSES = (403, 429, 503)
    MAX_RETRIES = 3

    def _retry_with_new_proxy(self, request):
        """Return a copy of *request* on a fresh proxy, or None if exhausted."""
        retries = request.meta.get("proxy_retry_times", 0)
        if retries >= self.MAX_RETRIES:
            return None
        retry_request = request.copy()
        retry_request.meta["proxy"] = random.choice(self.PROXIES)
        retry_request.meta["proxy_retry_times"] = retries + 1
        # The URL was already seen once — bypass the dupefilter for the retry.
        retry_request.dont_filter = True
        return retry_request

    def process_response(self, request, response, spider):
        if response.status in self.RETRY_STATUSES:
            # Retry with different proxy; fall through to the original
            # response once the retry budget is spent.
            retry_request = self._retry_with_new_proxy(request)
            if retry_request is not None:
                return retry_request
        return response

    def process_exception(self, request, exception, spider):
        # Retry with a different proxy on connection error. Returning None
        # (retries exhausted) hands the exception to other middleware.
        return self._retry_with_new_proxy(request)
Complete Production Spider
import scrapy


class ProductionSpider(scrapy.Spider):
    """Production spider: proxy on every request, retry settings, errback."""

    name = "production"
    custom_settings = {
        "CONCURRENT_REQUESTS": 8,
        "DOWNLOAD_DELAY": 1,
        "RETRY_TIMES": 3,
        "RETRY_HTTP_CODES": [500, 502, 503, 504, 408, 429],
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One proxy endpoint shared by every request this spider issues.
        self.proxy = "http://user:pass@PROXYHOST:PORT"

    def start_requests(self):
        for page in range(1, 101):
            yield scrapy.Request(
                f"https://example.com/page/{page}",
                meta={"proxy": self.proxy},
                callback=self.parse,
                errback=self.handle_error,
            )

    def parse(self, response):
        if response.status != 200:
            return
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
        }

    def handle_error(self, failure):
        failed_url = failure.request.url
        self.logger.error(f"Failed: {failed_url}")
ProxyMesh Headers Reference
Send these headers to control proxy behavior:
- X-ProxyMesh-Country — Route through a specific country (e.g., "US"). Only works with the world proxy or open proxy.
- X-ProxyMesh-IP — Request a specific outgoing IP address.
- X-ProxyMesh-Not-IP — Exclude specific IPs from rotation.
The proxy returns X-ProxyMesh-IP in the response with the IP address used.
Resources
- Scrapy Documentation
- scrapy-proxy-headers Documentation
- ProxyMesh Headers Reference
- Example Code on GitHub
Related Python Proxy Guides
Explore proxy configuration for other Python HTTP libraries:
- aiohttp - High-performance async HTTP client
- Requests - Simple Python HTTP for scripts
- CloudScraper - Bypass Cloudflare protection