-
Notifications
You must be signed in to change notification settings - Fork 0
/
4ChanStockScraper.py
40 lines (31 loc) · 2.26 KB
/
4ChanStockScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import collections
import re
import requests
from bs4 import BeautifulSoup
def get_common_phrases(thread_url, phrase_length, exclude_list=None, exclude_dict=None):
response = requests.get(thread_url)
response.raise_for_status()
phrases = collections.Counter()
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')
posts = soup.find_all(class_='postMessage')
for post in posts:
# Extract phrases of specified length consisting of all capital letters
text = post.get_text()
matches = re.findall(r'\b[A-Z]{%d}\b' % phrase_length, text)
for match in matches:
if match not in exclude_list and match not in exclude_dict:
phrases[match] += 1
# Get the most common phrases
most_common_phrases = phrases.most_common(10) # Change the number as desired
return most_common_phrases
# On the next line change the URL to the current Stock Market General on BIZ
thread_url = 'https://boards.4channel.org/biz/thread/55352526'
phrase_length = 3 # or 4, depending on your desired length
exclude_list = ['ABCD', 'EFGH'] # Specify the phrases you want to exclude in a list
exclude_dict = {'IJKL': 1, 'MNOP': 2} # Specify the phrases and their counts to exclude in a dictionary
exclude_list = ['TURN', 'PLAN','PUMP', 'AUSE', 'ONTH', 'SAND', 'INGS', 'WILL', 'MENT', 'MARK', 'RKET', 'RADI', 'YEAR', 'IONS', 'COMP', 'INVE', 'ADIN', 'ESTI', 'VEST', 'FORE', 'NVES', 'EDTO', 'VERY', 'WHAT', 'SIDE', 'TYOU', 'MONE', 'THES', 'CASH', 'EYOU', 'LIKE', 'IGHT', 'STOC', 'ARKE', 'DONT', 'ABOU', 'BOUT', 'BOND', 'ERES', 'ANDT', 'ATIO', 'ESTO', 'TOCK', 'LOCK', 'ROCK', 'COCK', 'TRAD', 'STHE', 'MAKE', 'OUGH', 'DING', 'INGA', 'ONEY', 'INGI', 'THAT', 'CUCK', 'THER', 'TTHE', 'TRAN', 'HAVE', 'THIN', 'JUST', 'TING', 'INGT', 'ETHE', 'THIS', 'THEM', 'YOUR', 'HERE', 'WITH', 'TION', 'ALLY', 'NTHE', 'SOME', 'OULD', 'THEY', 'OFTH', 'FTHE', 'THEN', 'HTTP', 'TTPS', 'HING', 'WHEN', 'OTHE', 'KING', 'EVER'] # Specify the phrases you want to exclude in a list
common_phrases = get_common_phrases(thread_url, phrase_length, exclude_list, exclude_dict)
print(f'Most common {phrase_length}-letter phrases:')
for phrase, count in common_phrases:
print(f'{phrase}: {count} occurrences')