-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: main.py
More file actions
108 lines (85 loc) · 4.11 KB
/
main.py
File metadata and controls
108 lines (85 loc) · 4.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import time
from schema import *
from agent import LLMagent as agent
from Prompt import ExtractTaskLlm
from Googletask import createTask
def scrape_linkedin_posts(query):
    """Scrape LinkedIn content-search results for *query* and file a Google
    Task for each post found.

    Opens a persistent (logged-in) Chromium session, scrolls the results feed
    to trigger lazy loading, parses the rendered HTML with BeautifulSoup, and
    for every post sends its URL/text to the LLM agent, then records the
    agent's extracted job info via ``createTask``.

    Args:
        query: Free-text LinkedIn search query; may contain quotes and
            boolean operators (they are URL-encoded here).
    """
    # Local import: quote() encodes '"' and other special characters that the
    # previous replace(' ', '%20') left raw, which broke quoted queries.
    from urllib.parse import quote

    with sync_playwright() as p:
        # Persistent context keeps the LinkedIn login session between runs.
        browser = p.chromium.launch_persistent_context(
            user_data_dir="./linkedin_session", headless=False
        )
        page = browser.new_page()
        url = f"https://www.linkedin.com/search/results/content/?keywords={quote(query)}"
        page.goto(url)
        page.wait_for_selector("div.occludable-update", timeout=20000)

        # Scroll several times so lazily loaded posts render into the DOM.
        for _ in range(6):
            page.mouse.wheel(0, 3000)
            time.sleep(2)

        soup = BeautifulSoup(page.content(), "html.parser")
        for post in soup.find_all("div", class_="feed-shared-update-v2"):
            # data-urn uniquely identifies the post; rebuild its canonical URL.
            data_urn = post.get("data-urn")
            post_url = f"https://www.linkedin.com/feed/update/{data_urn}" if data_urn else "URL not found"

            text_div = post.find("div", class_="update-components-text")
            text = text_div.get_text(separator="\n", strip=True) if text_div else "No text found"

            # Extracted but not yet forwarded to the agent/task payload.
            author_tag = post.find("span", class_="update-components-actor__title")
            author_name = author_tag.get_text(strip=True) if author_tag else "Author not found"

            agentFeedback = agent(ExtractTaskLlm(url=post_url, text=text))
            # Single quotes inside the f-strings: reusing double quotes here
            # is a SyntaxError on Python versions before 3.12.
            createTask({
                "title": f"LinkedIn Application -> {agentFeedback['job_title']}",
                "notes": f"{agentFeedback['hiring_task']} - {agentFeedback['post_url']}",
            })

        input("Press Enter to close the browser...")
        browser.close()
def scrape_linkedin_people(query):
    """Scrape LinkedIn people-search results for *query* and print each
    person's name, headline, location, profile URL, and image URL.

    Opens a persistent (logged-in) Chromium session, scrolls to load more
    result cards, then parses the rendered HTML with BeautifulSoup.

    Args:
        query: Free-text LinkedIn people-search query.
    """
    # NOTE(fix): the original `def` line was commented out while the body
    # remained, leaving broken top-level code that referenced an undefined
    # `query`. The function definition is restored here.
    with sync_playwright() as p:
        browser = p.chromium.launch_persistent_context(
            user_data_dir="./linkedin_session",
            headless=False
        )
        page = browser.new_page()
        url = f"https://www.linkedin.com/search/results/people/?keywords={query.replace(' ', '%20')}"
        page.goto(url)
        # Wait for ANY person card to appear before scrolling.
        page.wait_for_selector('a[href*="/in/"]', timeout=20000)

        # Scroll to load more results into the DOM.
        for _ in range(6):
            page.mouse.wheel(0, 3000)
            time.sleep(2)

        soup = BeautifulSoup(page.content(), "html.parser")
        # Keep only anchors that actually wrap a person-card title link.
        cards = [
            c for c in soup.select('a[href*="/in/"]')
            if c.select_one('a[data-view-name="search-result-lockup-title"]')
        ]
        for card in cards:
            profile_url = card.get("href")

            name_tag = card.select_one('a[data-view-name="search-result-lockup-title"]')
            name = name_tag.get_text(strip=True) if name_tag else "No name"

            # Headline and location share the same obfuscated class pair;
            # they are distinguished only by order (headline first).
            info_tags = card.select("p.bcbad6fa.b47b8486")
            headline = info_tags[0].get_text(strip=True) if info_tags else "No headline"
            location = info_tags[1].get_text(strip=True) if len(info_tags) > 1 else "No location"

            img_tag = card.select_one("img[alt]")
            img = img_tag.get("src") if img_tag else "No image"

            print("\n----- PERSON -----")
            print("Name:", name)
            print("Headline:", headline)
            print("Location:", location)
            print("Profile URL:", profile_url)
            print("Image URL:", img)

        input("Press Enter to close...")
        browser.close()
if __name__ == "__main__":
    # Guard the scraper launch so importing this module (e.g. for tests or
    # reuse of the scrape functions) does not open a browser as a side effect.
    scrape_linkedin_posts('''"freelance project" OR "looking for freelancer" OR "need a freelancer" OR "hiring freelancer" OR "freelance work"''')
    # scrape_linkedin_people('''"flutter developer" OR "react native"''')