Scrape LinkedIn and Zillow in Python using ISP proxies. Avoid bans, gather data, and maximize speed.
In the modern age of data, web scraping can open the door to all sorts of insights—from finding job leads on LinkedIn to discovering new housing opportunities on Zillow. Yet scraping the same site over and over with the same IP can quickly lead to bans, rate limiting, or extra verification. That’s where proxies come to the rescue. By rotating your IP address from a list of proxies, you look more like regular users spread across different addresses—keeping your scraping smooth and uninterrupted.
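As a quick illustration, rotation can be as simple as choosing a different proxy for each request. The snippet below is a minimal sketch: the proxy URLs and the function name are placeholders, not real endpoints or a prescribed API.

import random
import requests

# Placeholder proxy endpoints -- substitute your own hosts and credentials.
proxy_pool = [
    "http://user:pass@proxy1.example.com:3128",
    "http://user:pass@proxy2.example.com:3128",
    "http://user:pass@proxy3.example.com:3128",
]

def fetch(url):
    """Send one GET request through a randomly chosen proxy."""
    proxy = random.choice(proxy_pool)
    return requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=10)

Each call picks a fresh IP, so consecutive requests are spread across the pool instead of hammering the site from one address.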
Many sites look for the default Python user-agent string (python-requests) to spot bots. Switching up your user-agent can help you blend in:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                  " AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/58.0.3029.110 Safari/537.3"
}
Including these headers in your requests can make your script appear more like a standard web browser.
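Passing them along with each request is a one-line change. This short sketch reuses the headers dict defined above; example.com stands in for whatever page you actually intend to fetch.

import requests

# Send the custom user-agent instead of the default python-requests string.
response = requests.get("https://example.com", headers=headers, timeout=10)
print(response.status_code)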
Goal: Collect public job postings or headlines from LinkedIn’s job search page.
Challenge: LinkedIn may rate-limit or ban repeated requests from the same IP.
import requests
from bs4 import BeautifulSoup
import time
import random

# A list of proxy endpoints (ISP IPs), each with credentials.
proxy_list = [
    "http://user1:pass1@proxy1.statproxies.com:3128",
    "http://user2:pass2@proxy2.statproxies.com:3128",
    "http://user3:pass3@proxy3.statproxies.com:3128"
]

headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/58.0.3029.110 Safari/537.3")
}

def fetch_linkedin_jobs(url):
    """
    Fetch job postings from LinkedIn's public jobs page
    using a random ISP proxy and a custom user-agent.
    """
    proxy = random.choice(proxy_list)
    print(f"Using proxy: {proxy}")
    try:
        response = requests.get(
            url,
            proxies={"http": proxy, "https": proxy},
            headers=headers,
            timeout=10
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Example: find job cards
        jobs = soup.select(".base-card__full-link")
        job_titles = [job.get_text(strip=True) for job in jobs]
        return job_titles
    except requests.exceptions.RequestException as e:
        print(f"Request failed with proxy {proxy}: {e}")
        return []

def main_linkedin():
    linkedin_url = "https://www.linkedin.com/jobs/search/?f_TPR=r2592000&geoId=103644278&keywords=python"
    all_job_titles = fetch_linkedin_jobs(linkedin_url)
    print("\nLinkedIn Job Titles Found:")
    for title in all_job_titles[:5]:  # Show only the first five
        print("-", title)

# Demo call
if __name__ == "__main__":
    main_linkedin()
".base-card__full-link"
Goal: Gather listing info from Zillow, such as addresses and prices.
Challenge: Zillow may throttle or block your IP after many requests without rotation.
import requests
from bs4 import BeautifulSoup
import time

proxy_list = [
    "http://user1:pass1@proxy1.statproxies.com:3128",
    "http://user2:pass2@proxy2.statproxies.com:3128",
    "http://user3:pass3@proxy3.statproxies.com:3128"
]

headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/58.0.3029.110 Safari/537.3")
}

def get_zillow_listings(url, proxy):
    """
    Fetches Zillow listings using the provided proxy and a custom user-agent.
    """
    print(f"Using proxy: {proxy}")
    try:
        resp = requests.get(
            url,
            proxies={"http": proxy, "https": proxy},
            headers=headers,
            timeout=10
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        listings = soup.select(".list-card-info")
        data = []
        for listing in listings:
            address_tag = listing.select_one(".list-card-addr")
            price_tag = listing.select_one(".list-card-price")
            link_tag = listing.select_one("a.list-card-link")
            address = address_tag.get_text(strip=True) if address_tag else "N/A"
            price = price_tag.get_text(strip=True) if price_tag else "N/A"
            link = link_tag.get("href") if link_tag else "N/A"
            data.append((address, price, link))
        return data
    except requests.exceptions.RequestException as e:
        print(f"Request failed with proxy {proxy}: {e}")
        return []

def main_zillow():
    zillow_url = "https://www.zillow.com/homes/for_sale/"
    all_data = []
    for proxy in proxy_list:
        result = get_zillow_listings(zillow_url, proxy)
        all_data.extend(result)
        # Wait between requests
        time.sleep(3)
    print("\nZillow Listings Collected:")
    for entry in all_data[:5]:  # Show only the first five
        print("Address:", entry[0], "| Price:", entry[1], "| Link:", entry[2])

# Demo call
if __name__ == "__main__":
    main_zillow()
This Zillow example loops through every proxy in proxy_list, distributing requests across the pool.
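If you want a steadier rotation than a single pass over the list, itertools.cycle gives you round-robin behavior. The sketch below assumes you are fetching several result pages; the page URLs are placeholders, so swap in whatever pages you actually need.

from itertools import cycle

proxy_cycle = cycle(proxy_list)

# Placeholder result pages for illustration.
page_urls = [
    "https://www.zillow.com/homes/for_sale/2_p/",
    "https://www.zillow.com/homes/for_sale/3_p/",
]

for page_url in page_urls:
    proxy = next(proxy_cycle)  # each request gets the next proxy in the rotation
    listings = get_zillow_listings(page_url, proxy)
    print(f"{page_url}: {len(listings)} listings")
    time.sleep(3)  # polite pause between requests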
If you're tired of getting rate-limited or blocked, proxies can keep your data pipeline flowing. That's where Stat Proxies can help. We offer: