AutoScraper Proxy Configuration
AutoScraper is an intelligent automatic web scraping library that uses machine learning to learn extraction rules from examples. Instead of writing CSS selectors or XPath, you show AutoScraper what data you want, and it figures out how to extract similar data.
Combined with ProxyMesh rotating proxies, AutoScraper enables scalable, intelligent data extraction without getting blocked. The library is built on Requests, so it supports proxy configuration for large-scale automated scraping.
How AutoScraper Works
- Provide examples - Give AutoScraper a URL and example data
- Learn rules - AutoScraper analyzes the page and learns patterns
- Extract at scale - Apply learned rules to similar pages
Perfect for e-commerce scraping, search results, directories, and news aggregation.
Installation
Install AutoScraper using pip:
pip install autoscraper
AutoScraper is built on Requests, so it supports proxy configuration through request_args.
Basic Proxy Configuration
AutoScraper's build(), get_result_similar(), and get_result_exact() methods accept request_args for proxy configuration:
from autoscraper import AutoScraper

scraper = AutoScraper()

# ProxyMesh endpoint for both plain and TLS traffic
proxy_config = {
    "http": "http://username:password@PROXYHOST:PORT",
    "https": "http://username:password@PROXYHOST:PORT",
}

# Teach the scraper: give it one page plus example values to find.
training_url = "https://books.toscrape.com/"
examples = ["A Light in the Attic", "£51.77"]
scraper.build(training_url, examples, request_args={"proxies": proxy_config})

# Apply the learned rules to a structurally similar page.
results = scraper.get_result_similar(
    "https://books.toscrape.com/catalogue/page-2.html",
    request_args={"proxies": proxy_config},
)
print(results)
Save and Load Trained Scrapers
from autoscraper import AutoScraper

proxy_config = {"https": "http://user:pass@PROXYHOST:PORT"}

# Train once, then persist the learned extraction rules to disk.
trainer = AutoScraper()
trainer.build(
    "https://news-site.com",
    ["Headline Text", "Author Name"],
    request_args={"proxies": proxy_config},
)
trainer.save("news_scraper")

# Later (even in another process): restore the rules without retraining.
restored = AutoScraper()
restored.load("news_scraper")
results = restored.get_result_similar(
    "https://news-site.com/latest",
    request_args={"proxies": proxy_config},
)
Multiple Data Types
from autoscraper import AutoScraper

proxy_config = {"https": "http://user:pass@PROXYHOST:PORT"}
scraper = AutoScraper()

# One example per field (name, price, stock status) so the scraper
# learns a separate rule for each data type.
scraper.build(
    "https://shop.example.com/products",
    ["Product Name", "$29.99", "In Stock"],
    request_args={"proxies": proxy_config},
)

# group_by_alias=True returns a dict keyed by rule alias instead of a flat list.
results = scraper.get_result_similar(
    "https://shop.example.com/products?page=2",
    group_by_alias=True,
    request_args={"proxies": proxy_config},
)
Custom Proxy Headers
AutoScraper uses Requests internally. For custom proxy headers, you can fetch pages manually and pass HTML to AutoScraper:
pip install python-proxy-headers
from autoscraper import AutoScraper
from python_proxy_headers import requests_adapter

PROXY_URL = "http://user:pass@PROXYHOST:PORT"

# Fetch the training page ourselves so custom proxy headers can be sent.
page = requests_adapter.get(
    "https://books.toscrape.com/",
    proxies={"https": PROXY_URL},
    proxy_headers={"X-ProxyMesh-Country": "US"},
)
print(f"Routed through: {page.headers.get('X-ProxyMesh-IP')}")

# Train AutoScraper on the pre-fetched HTML instead of letting it download.
scraper = AutoScraper()
scraper.build(
    "https://books.toscrape.com/",
    ["A Light in the Attic", "£51.77"],
    html=page.text,
)

# Subsequent extractions can use the standard proxy configuration.
proxies = {"https": PROXY_URL}
results = scraper.get_result_similar(
    "https://books.toscrape.com/catalogue/page-2.html",
    request_args={"proxies": proxies},
)
Common Use Cases
Product Catalog Scraping
from autoscraper import AutoScraper

scraper = AutoScraper()
proxies = {"https": "http://user:pass@PROXYHOST:PORT"}

# Train on product listing
scraper.build(
    "https://shop.example.com/products",
    ["Product Name", "$99.99", "In Stock"],
    request_args={"proxies": proxies}
)

# Set aliases for organized output
scraper.set_rule_aliases({
    "rule_0": "name",
    "rule_1": "price",
    "rule_2": "availability"
})

# Scrape multiple pages
all_products = []
for page in range(1, 11):
    results = scraper.get_result_similar(
        f"https://shop.example.com/products?page={page}",
        group_by_alias=True,
        request_args={"proxies": proxies}
    )
    # Guard everything that uses the results: `results` can be empty when a
    # page fails or has no matches, and `names` is only bound inside this
    # branch — the per-page print must stay here too, or the first empty
    # page raises NameError.
    if results:
        names = results.get("name", [])
        prices = results.get("price", [])
        all_products.extend(zip(names, prices))
        print(f"Page {page}: {len(names)} products")

print(f"Total: {len(all_products)} products")
Rotating Proxy Locations
from autoscraper import AutoScraper
import random

# Pool of ProxyMesh endpoints (e.g. different geographic locations).
PROXIES = [
    {"https": "http://user:pass@PROXYHOST:PORT"},
    {"https": "http://user:pass@PROXYHOST:PORT"},
    {"https": "http://user:pass@PROXYHOST:PORT"},
]

scraper = AutoScraper()

# Train with one proxy
scraper.build(
    "https://example.com",
    ["Example Data"],
    request_args={"proxies": PROXIES[0]}
)

# Pages to extract from — the original snippet used `urls` without
# defining it, which raised NameError; define the list explicitly.
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
]

# Extract with rotation: pick a random proxy per request.
for url in urls:
    proxy = random.choice(PROXIES)
    results = scraper.get_result_similar(url, request_args={"proxies": proxy})
    print(f"{url}: {len(results)} items")
News Aggregation
from autoscraper import AutoScraper

scraper = AutoScraper()
proxy_config = {"https": "http://user:pass@PROXYHOST:PORT"}

# Learn headline + timestamp patterns from one news site.
scraper.build(
    "https://news-site.com",
    ["Breaking News Headline", "2 hours ago"],
    request_args={"proxies": proxy_config},
)

# The learned rules often transfer to similarly structured sites.
sites = [
    "https://news-site.com/latest",
    "https://another-news.com",
]
for site in sites:
    try:
        results = scraper.get_result_similar(
            site,
            request_args={"proxies": proxy_config, "timeout": 30},
        )
    except Exception as e:
        print(f"{site}: Error - {e}")
    else:
        print(f"{site}: {len(results)} items")
Exact vs Similar Matching
from autoscraper import AutoScraper

scraper = AutoScraper()
proxies = {"https": "http://user:pass@PROXYHOST:PORT"}

scraper.build(
    "https://example.com",
    ["Exact Text"],
    request_args={"proxies": proxies},
)

# get_result_exact: only values that match the trained rules exactly.
exact = scraper.get_result_exact(
    "https://example.com/page2",
    request_args={"proxies": proxies},
)

# get_result_similar: anything with a similar page structure (usually more).
similar = scraper.get_result_similar(
    "https://example.com/page2",
    request_args={"proxies": proxies},
)

print(f"Exact: {len(exact)}, Similar: {len(similar)}")
Error Handling with Retries
from autoscraper import AutoScraper
import random
import time

PROXIES = [
    {"https": "http://user:pass@PROXYHOST:PORT"},
    {"https": "http://user:pass@PROXYHOST:PORT"},
]

def scrape_with_retry(scraper, url, max_retries=3):
    """Extract from *url* through a randomly chosen proxy, retrying on failure.

    Uses exponential backoff (1s, 2s, 4s, ...) between attempts. Returns the
    extracted results, or None if every attempt failed or came back empty.
    """
    for attempt in range(max_retries):
        proxy = random.choice(PROXIES)
        try:
            results = scraper.get_result_similar(
                url,
                request_args={"proxies": proxy, "timeout": 30}
            )
            if results:
                return results
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
        # Back off before the next attempt (also on empty results, which the
        # original skipped), but never after the final one — the original
        # slept up to 4s for nothing before returning None.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)
    return None

# Usage
scraper = AutoScraper()
scraper.build("https://example.com", ["Data"], request_args={"proxies": PROXIES[0]})
results = scrape_with_retry(scraper, "https://example.com/page2")
Data Validation
from autoscraper import AutoScraper
import re
def is_price(text):
    """Return True if *text* looks like a price, e.g. "$29.99" or "1,234"."""
    normalized = text.replace(',', '')
    return re.match(r'^\$?\d+\.?\d*$', normalized) is not None
scraper = AutoScraper()
proxies = {"https": "http://user:pass@PROXYHOST:PORT"}

# Train with one name example and one price example.
scraper.build(
    "https://shop.example.com",
    ["Product Name", "$50.00"],
    request_args={"proxies": proxies}
)

results = scraper.get_result_similar(
    "https://shop.example.com/page2",
    request_args={"proxies": proxies}
)

# Split the flat result list into prices and non-prices using the validator.
prices = [item for item in results if is_price(item)]
names = [item for item in results if not is_price(item)]
print(f"Names: {len(names)}, Prices: {len(prices)}")
ProxyMesh Headers Reference
Send these headers to control proxy behavior:
- X-ProxyMesh-Country - Route through a specific country (e.g., "US"). Only works with the world proxy or open proxy
- X-ProxyMesh-IP - Request a specific outgoing IP address
- X-ProxyMesh-Not-IP - Exclude specific IPs from rotation
The proxy returns X-ProxyMesh-IP in the response with the IP address used.
Resources
- AutoScraper Documentation
- python-proxy-headers Documentation
- ProxyMesh Headers Reference
- Example Code on GitHub
Related Python Proxy Guides
Explore proxy configuration for other Python HTTP libraries:
- Requests - The library AutoScraper is built on
- Scrapy - Full web crawling framework
- CloudScraper - Bypass Cloudflare protection