import pandas as pd
import re
import datetime
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
from scipy import stats

# Load the four daily price histories (one CSV per ticker), in the same
# order the rest of the notebook refers to them.
tsla, amzn, pfe, gme = (
    pd.read_csv(csv_path)
    for csv_path in ("TSLA.csv", "AMZN.csv", "PFE.csv", "GME.csv")
)

# Load the scraped Reddit comments and peek at the first rows.
comments = pd.read_csv("comments.csv")
comments.head()


# Daily "mean" price: the midpoint of the open and close prices, computed
# column-wise so it does not depend on the frame having a 0..n-1 index.
tsla['mean'] = .5 * (tsla['Open'] + tsla['Close'])

# Re-format each 'M/D/YY' date string as an ISO 'YYYY-MM-DD' string.  The
# two-digit year is prefixed with '20', i.e. all dates are taken to be in
# the 2000s.
tsla_iso_dates = []
for raw_date in tsla['Date']:
    month, day, short_year = raw_date.split('/')
    tsla_iso_dates.append(
        str(datetime.date(int('20' + short_year), int(month), int(day)))
    )
tsla['Date'] = tsla_iso_dates

# Map each ISO date string to that day's mean price.
tsla_dict = dict(zip(tsla['Date'], tsla['mean']))

# Daily "mean" price: the midpoint of the open and close prices, computed
# column-wise so it does not depend on the frame having a 0..n-1 index.
amzn['mean'] = .5 * (amzn['Open'] + amzn['Close'])

# Re-format each 'M/D/YY' date string as an ISO 'YYYY-MM-DD' string.  The
# two-digit year is prefixed with '20', i.e. all dates are taken to be in
# the 2000s.
amzn_iso_dates = []
for raw_date in amzn['Date']:
    month, day, short_year = raw_date.split('/')
    amzn_iso_dates.append(
        str(datetime.date(int('20' + short_year), int(month), int(day)))
    )
amzn['Date'] = amzn_iso_dates

# Map each ISO date string to that day's mean price.
amzn_dict = dict(zip(amzn['Date'], amzn['mean']))

# Daily "mean" price: the midpoint of the open and close prices, computed
# column-wise so it does not depend on the frame having a 0..n-1 index.
pfe['mean'] = .5 * (pfe['Open'] + pfe['Close'])

# Re-format each 'M/D/YY' date string as an ISO 'YYYY-MM-DD' string.  The
# two-digit year is prefixed with '20', i.e. all dates are taken to be in
# the 2000s.
pfe_iso_dates = []
for raw_date in pfe['Date']:
    month, day, short_year = raw_date.split('/')
    pfe_iso_dates.append(
        str(datetime.date(int('20' + short_year), int(month), int(day)))
    )
pfe['Date'] = pfe_iso_dates

# Map each ISO date string to that day's mean price.
pfe_dict = dict(zip(pfe['Date'], pfe['mean']))

# Daily "mean" price: the midpoint of the open and close prices, computed
# column-wise so it does not depend on the frame having a 0..n-1 index.
gme['mean'] = .5 * (gme['Open'] + gme['Close'])

# Re-format each 'M/D/YY' date string as an ISO 'YYYY-MM-DD' string.  The
# two-digit year is prefixed with '20', i.e. all dates are taken to be in
# the 2000s.
gme_iso_dates = []
for raw_date in gme['Date']:
    month, day, short_year = raw_date.split('/')
    gme_iso_dates.append(
        str(datetime.date(int('20' + short_year), int(month), int(day)))
    )
gme['Date'] = gme_iso_dates

# Map each ISO date string to that day's mean price.
gme_dict = dict(zip(gme['Date'], gme['mean']))


# Per-comment feature extraction: mention counts for each company, the
# comment's calendar day, and each stock's mean price for that day.

# Ticker / company-name patterns; the character classes make them match in
# any letter case.  Compiled once here instead of being re-parsed per row.
# NOTE(review): the patterns are unanchored, so a ticker is also counted
# when it appears inside a longer word (e.g. "gme" in "judgment") -- confirm
# that this is intended.
_MENTION_PATTERNS = {
    'amzn': re.compile('[Aa][Mm][Zz][Nn]|[Aa][Mm][Aa][Zz][Oo][Nn]'),
    'tsla': re.compile('[Tt][Ss][Ll][Aa]|[Tt][Ee][Ss][Ll][Aa]'),
    'gme': re.compile('[Gg][Mm][Ee]|[Gg][Aa][Mm][Ee][Ss][Tt][Oo][Pp]'),
    'pfe': re.compile('[Pp][Ff][Ee]|[Pp][Ff][Ii][Zz][Ee][Rr]'),
}

# Output column name -> {ISO date string: mean price} lookup table.
_PRICE_TABLES = {
    'tsla_price': tsla_dict,
    'amzn_price': amzn_dict,
    'pfe_price': pfe_dict,
    'gme_price': gme_dict,
}

# How many days to walk backwards when a day has no price entry (weekend,
# market holiday).  A week covers a weekend plus adjacent closures.
_MAX_LOOKBACK_DAYS = 7


def _price_on_or_before(prices, day, max_lookback=_MAX_LOOKBACK_DAYS):
    """Return the price for *day*, or for the nearest earlier day in *prices*.

    Args:
        prices: dict mapping ISO 'YYYY-MM-DD' strings to prices.
        day: ``datetime.date`` to look up.
        max_lookback: maximum number of days to step back from *day*.

    Returns:
        The price stored for the first date found, searching *day*,
        *day* - 1, ... *day* - *max_lookback*.

    Raises:
        KeyError: if none of those dates is present in *prices*.
    """
    for days_back in range(max_lookback + 1):
        key = str(day - datetime.timedelta(days=days_back))
        if key in prices:
            return prices[key]
    raise KeyError(
        f'no price within {max_lookback} days on or before {day}'
    )


mention_counts = {name: [] for name in _MENTION_PATTERNS}
price_columns = {name: [] for name in _PRICE_TABLES}
iso_days = []       # comment day as an ISO string
calendar_days = []  # comment day as a datetime.date

for body, created in zip(comments['body'], comments['created_utc']):
    # A missing body is read from the CSV as NaN (a float); count it as
    # containing no mentions instead of crashing the regex search.
    text = body if isinstance(body, str) else ''
    for name, pattern in _MENTION_PATTERNS.items():
        mention_counts[name].append(len(pattern.findall(text)))

    # Create the date from epoch time.
    # NOTE(review): fromtimestamp() converts using the machine's local time
    # zone, so the assigned day can differ between machines -- confirm which
    # zone is intended for matching against the price data.
    day = datetime.datetime.fromtimestamp(created).date()
    calendar_days.append(day)
    iso_days.append(str(day))

    # Price of each stock on that day (or the most recent earlier day that
    # has a price).
    for name, table in _PRICE_TABLES.items():
        price_columns[name].append(_price_on_or_before(table, day))

# Add all the new columns to the dataframe.
for name, counts in mention_counts.items():
    comments[name] = counts
comments['datetime'] = iso_days
for name, prices in price_columns.items():
    comments[name] = prices
comments['date'] = calendar_days
comments.head()


# All distinct comment dates.  Sorted so that the dict order below -- and
# therefore which day wins when several days tie for the maximum -- is the
# same on every run (a plain set has no stable order).
x = sorted(set(comments['date']))

# Colour palette for the graphs: every CSS4 colour value, in definition
# order.  The plots index colors[9] and colors[10], so the list is not cut
# down to the number of dates.
colors = list(mcolors.CSS4_COLORS.values())

y = {}   # The comment volume for each day
yy = {}  # The price for each day
for day in x:
    rows_for_day = comments[comments['date'] == day]
    # Comment volume: total number of pattern matches over that day's rows.
    y[day] = sum(rows_for_day['tsla'])
    # All rows of one day carry the same price, so max() just picks it.
    yy[day] = max(rows_for_day['tsla_price'])
# Values as lists, in the same order as the dates in x.
y1 = [y[day] for day in x]
y2 = [yy[day] for day in x]

# Graph Comments
fig, ax = plt.subplots()
plt.scatter(x=x, y=y1, color=colors[10])
plt.title("Tesla Mentions in Comments Over Last 90 days")
plt.xticks(rotation=25)

# Graph Price
fig, ax = plt.subplots()
plt.scatter(x=x, y=y2, color=colors[9])
plt.title("Tesla Price [$] Over Last 90 days")
plt.xticks(rotation=25)
plt.show()

# Lag: the day of the highest price minus the day of the most mentions
# (the earliest such day when there is a tie).
peak_price_day = max(yy, key=yy.get)
peak_mentions_day = max(y, key=y.get)
tsla_lag = peak_price_day - peak_mentions_day
print(f'The lag between the max comments and max price is {tsla_lag}')

The lag between the max comments and max price is 14 days, 0:00:00


# Price against mentions, one point per day.
a = np.array(y1)
b = np.array(y2)
plt.scatter(x=a, y=b)

# Least-squares fit of price on mentions.
fit = stats.linregress(a, b)
m, yint, r, p = fit.slope, fit.intercept, fit.rvalue, fit.pvalue

# Report the fitted line and its significance.
print(f'The results of linear regression can be represented by the function y = {m} * x + {yint}.')
print(f'The correlation coefficient is {r}.')
print(f'The p-value is {p}.')

# Draw the regression line over the scatter.
fitted_line = m * a + yint
plt.plot(a, fitted_line, 'r')
plt.xlabel('Mentions')
plt.ylabel('Price')
plt.title("Price vs. Mentions")
plt.show()

# Price predicted from each day's mentions, plotted with the actual price.
y3 = list(m * a + yint)
fig, ax = plt.subplots()
for series, shade in ((y2, colors[9]), (y3, colors[10])):
    plt.scatter(x=x, y=series, color=shade)
plt.title("Tesla Price [$] Over Last 90 days")
plt.legend(["Actual", "Predicted"], bbox_to_anchor=(1.05, 1), ncol=1, loc='upper left')
plt.xticks(rotation=25)
plt.show()

The results of linear regression can be represented by the function y = 10.052571766375776 * x + 934.6818885356132.
The correlation coefficient is 0.2645627223872205.
The p-value is 0.011740506199118293.


# Prices re-keyed to the day `tsla_lag` earlier, so each price lines up
# with the mentions made that long before it.
yyy = {day - tsla_lag: price for day, price in yy.items()}

# Keep only the shifted days that also have a mention count.
days = [day for day in yyy if day in y]
new_mentions = [y[day] for day in days]       # mentions on those days
adjusted_price = [yyy[day] for day in days]   # lag-shifted price for them

# Scatter shifted price against mentions.
a = np.array(new_mentions)
b = np.array(adjusted_price)
plt.scatter(x=a, y=b)

# Least-squares fit of the shifted price on mentions.
fit = stats.linregress(a, b)
m, yint, r, p = fit.slope, fit.intercept, fit.rvalue, fit.pvalue

# Report the fitted line and its significance.
print(f'The results of linear regression can be represented by the function y = {m} * x + {yint}.')
print(f'The correlation coefficient is {r}.')
print(f'The p-value is {p}.')

# Draw the regression line over the scatter.
plt.plot(a, m * a + yint, 'r')
plt.xlabel('Mentions')
plt.ylabel('Price')
plt.title("Price vs. Mentions Adjusted")
plt.show()

# Move the days forward by the lag again, back onto the dates the prices
# were actually recorded on.
days = [day + tsla_lag for day in days]
new_actual_price = [yy[day] for day in days]     # actual price on those dates
new_predicted_price = list(m * a + yint)         # price predicted from mentions

# Plot the predicted and actual prices for the overlapping time period.
fig, ax = plt.subplots()
for series, shade in ((new_actual_price, colors[9]), (new_predicted_price, colors[10])):
    plt.scatter(x=days, y=series, color=shade)
plt.title("Tesla Price [$] Over Last 90 days Adjusted")
plt.legend(["Actual", "Predicted"], bbox_to_anchor=(1.05, 1), ncol=1, loc='upper left')
plt.xticks(rotation=25)
plt.show()

The results of linear regression can be represented by the function y = 12.093862370798323 * x + 963.4208488349734.
The correlation coefficient is 0.37828048131126263.
The p-value is 0.0007537381590420461.


# Per-day Amazon mention totals (y) and prices (yy), keyed by date.
y = {}
yy = {}
for day in x:
    rows_for_day = comments[comments['date'] == day]
    y[day] = sum(rows_for_day['amzn'])
    yy[day] = max(rows_for_day['amzn_price'])
# The dicts were filled in the order of x, so their values line up with x.
y1 = list(y.values())
y2 = list(yy.values())

# One figure for mentions, one for price.
for series, shade, heading in (
    (y1, colors[10], "Amazon Mentions in Comments Over Last 90 days"),
    (y2, colors[9], "Amazon Price [$] Over Last 90 days"),
):
    fig, ax = plt.subplots()
    plt.scatter(x=x, y=series, color=shade)
    plt.title(heading)
    plt.xticks(rotation=25)
plt.show()

# Lag: day of the highest price minus day of the most mentions.
peak_price_day = max(yy, key=yy.get)
peak_mentions_day = max(y, key=y.get)
amzn_lag = peak_price_day - peak_mentions_day
print(f'The lag between the max comments and max price is {amzn_lag}')

The lag between the max comments and max price is 16 days, 0:00:00


# Prices re-keyed to the day `amzn_lag` earlier, so each price lines up
# with the mentions made that long before it.
yyy = {day - amzn_lag: price for day, price in yy.items()}

# Keep only the shifted days that also have a mention count.
days = [day for day in yyy if day in y]
new_mentions = [y[day] for day in days]       # mentions on those days
adjusted_price = [yyy[day] for day in days]   # lag-shifted price for them

# Scatter shifted price against mentions.
a = np.array(new_mentions)
b = np.array(adjusted_price)
plt.scatter(x=a, y=b)

# Least-squares fit of the shifted price on mentions.
fit = stats.linregress(a, b)
m, yint, r, p = fit.slope, fit.intercept, fit.rvalue, fit.pvalue

# Report the fitted line and its significance.
print(f'The results of linear regression can be represented by the function y = {m} * x + {yint}.')
print(f'The correlation coefficient is {r}.')
print(f'The p-value is {p}.')

# Draw the regression line over the scatter.
plt.plot(a, m * a + yint, 'r')
plt.xlabel('Mentions')
plt.ylabel('Price')
plt.title("Price vs. Mentions Adjusted")
plt.show()

# Move the days forward by the lag again, back onto the dates the prices
# were actually recorded on.
days = [day + amzn_lag for day in days]
new_actual_price = [yy[day] for day in days]     # actual price on those dates
new_predicted_price = list(m * a + yint)         # price predicted from mentions

# Plot the predicted and actual prices for the overlapping time period.
fig, ax = plt.subplots()
for series, shade in ((new_actual_price, colors[9]), (new_predicted_price, colors[10])):
    plt.scatter(x=days, y=series, color=shade)
plt.title("Amazon Price [$] Over Last 90 days Adjusted")
plt.legend(["Actual", "Predicted"], bbox_to_anchor=(1.05, 1), ncol=1, loc='upper left')
plt.xticks(rotation=25)
plt.show()

The results of linear regression can be represented by the function y = 37.912525145931596 * x + 3421.2958107178656.
The correlation coefficient is 0.2587892716518686.
The p-value is 0.02598979627193299.


# Per-day Pfizer mention totals (y) and prices (yy), keyed by date.
y = {}
yy = {}
for day in x:
    rows_for_day = comments[comments['date'] == day]
    y[day] = sum(rows_for_day['pfe'])
    yy[day] = max(rows_for_day['pfe_price'])
# The dicts were filled in the order of x, so their values line up with x.
y1 = list(y.values())
y2 = list(yy.values())

# One figure for mentions, one for price.
for series, shade, heading in (
    (y1, colors[10], "Pfizer Mentions in Comments Over Last 90 days"),
    (y2, colors[9], "Pfizer Price [$] Over Last 90 days"),
):
    fig, ax = plt.subplots()
    plt.scatter(x=x, y=series, color=shade)
    plt.title(heading)
    plt.xticks(rotation=25)
plt.show()

# Lag: day of the highest price minus day of the most mentions.
peak_price_day = max(yy, key=yy.get)
peak_mentions_day = max(y, key=y.get)
pfe_lag = peak_price_day - peak_mentions_day
print(f'The lag between the max comments and max price is {pfe_lag}')

The lag between the max comments and max price is 37 days, 0:00:00


# Prices re-keyed to the day `pfe_lag` earlier, so each price lines up
# with the mentions made that long before it.
yyy = {day - pfe_lag: price for day, price in yy.items()}

# Keep only the shifted days that also have a mention count.
days = [day for day in yyy if day in y]
new_mentions = [y[day] for day in days]       # mentions on those days
adjusted_price = [yyy[day] for day in days]   # lag-shifted price for them

# Scatter shifted price against mentions.
a = np.array(new_mentions)
b = np.array(adjusted_price)
plt.scatter(x=a, y=b)

# Least-squares fit of the shifted price on mentions.
fit = stats.linregress(a, b)
m, yint, r, p = fit.slope, fit.intercept, fit.rvalue, fit.pvalue

# Report the fitted line and its significance.
print(f'The results of linear regression can be represented by the function y = {m} * x + {yint}.')
print(f'The correlation coefficient is {r}.')
print(f'The p-value is {p}.')

# Draw the regression line over the scatter.
plt.plot(a, m * a + yint, 'r')
plt.xlabel('Mentions')
plt.ylabel('Price')
plt.title("Price vs. Mentions Adjusted")
plt.show()

# Move the days forward by the lag again, back onto the dates the prices
# were actually recorded on.
days = [day + pfe_lag for day in days]
new_actual_price = [yy[day] for day in days]     # actual price on those dates
new_predicted_price = list(m * a + yint)         # price predicted from mentions

# Plot the predicted and actual prices for the overlapping time period.
fig, ax = plt.subplots()
for series, shade in ((new_actual_price, colors[9]), (new_predicted_price, colors[10])):
    plt.scatter(x=days, y=series, color=shade)
plt.title("Pfizer Price [$] Over Last 90 days Adjusted")
plt.legend(["Actual", "Predicted"], bbox_to_anchor=(1.05, 1), ncol=1, loc='upper left')
plt.xticks(rotation=25)
plt.show()

The results of linear regression can be represented by the function y = 0.5991114267883764 * x + 49.43520138450074.
The correlation coefficient is 0.21745524019402543.
The p-value is 0.11778682266217004.


# Per-day Gamestop mention totals (y) and prices (yy), keyed by date.
y = {}
yy = {}
for day in x:
    rows_for_day = comments[comments['date'] == day]
    y[day] = sum(rows_for_day['gme'])
    yy[day] = max(rows_for_day['gme_price'])
# The dicts were filled in the order of x, so their values line up with x.
y1 = list(y.values())
y2 = list(yy.values())

# One figure for mentions, one for price.
for series, shade, heading in (
    (y1, colors[10], "Gamestop Mentions in Comments Over Last 90 days"),
    (y2, colors[9], "Gamestop Price [$] Over Last 90 days"),
):
    fig, ax = plt.subplots()
    plt.scatter(x=x, y=series, color=shade)
    plt.title(heading)
    plt.xticks(rotation=25)
plt.show()

# Lag: day of the highest price minus day of the most mentions.
peak_price_day = max(yy, key=yy.get)
peak_mentions_day = max(y, key=y.get)
gme_lag = peak_price_day - peak_mentions_day
print(f'The lag between the max comments and max price is {gme_lag}')

The lag between the max comments and max price is 42 days, 0:00:00


# Prices re-keyed to the day `gme_lag` earlier, so each price lines up
# with the mentions made that long before it.
yyy = {day - gme_lag: price for day, price in yy.items()}

# Keep only the shifted days that also have a mention count.
days = [day for day in yyy if day in y]
new_mentions = [y[day] for day in days]       # mentions on those days
adjusted_price = [yyy[day] for day in days]   # lag-shifted price for them

# Scatter shifted price against mentions.
a = np.array(new_mentions)
b = np.array(adjusted_price)
plt.scatter(x=a, y=b)

# Least-squares fit of the shifted price on mentions.
fit = stats.linregress(a, b)
m, yint, r, p = fit.slope, fit.intercept, fit.rvalue, fit.pvalue

# Report the fitted line and its significance.
print(f'The results of linear regression can be represented by the function y = {m} * x + {yint}.')
print(f'The correlation coefficient is {r}.')
print(f'The p-value is {p}.')

# Draw the regression line over the scatter.
plt.plot(a, m * a + yint, 'r')
plt.xlabel('Mentions')
plt.ylabel('Price')
plt.title("Price vs. Mentions Adjusted")
plt.show()

# Move the days forward by the lag again, back onto the dates the prices
# were actually recorded on.
days = [day + gme_lag for day in days]
new_actual_price = [yy[day] for day in days]     # actual price on those dates
new_predicted_price = list(m * a + yint)         # price predicted from mentions

# Plot the predicted and actual prices for the overlapping time period.
fig, ax = plt.subplots()
for series, shade in ((new_actual_price, colors[9]), (new_predicted_price, colors[10])):
    plt.scatter(x=days, y=series, color=shade)
plt.title("Gamestop Price [$] Over Last 90 days Adjusted")
plt.legend(["Actual", "Predicted"], bbox_to_anchor=(1.05, 1), ncol=1, loc='upper left')
plt.xticks(rotation=25)
plt.show()

The results of linear regression can be represented by the function y = 0.05930501077199184 * x + 195.7102609878815.
The correlation coefficient is 0.004540453761644439.
The p-value is 0.9755661180977184.

	author	body	created_utc	score
0	HowlLaika	Its not you WHOLE savings. You still got 16 th...	1.639492e+09	1.0
1	lulu_butts	Screen shot that and complain to customer service	1.639492e+09	1.0
2	Fuji-one	I need to sell my car to buy more calls	1.639492e+09	1.0
3	pajamental805	Rip	1.639492e+09	1.0
4	AskMeAboutL00M	don't listen to all these people who tell you ...	1.639492e+09	1.0

	author	body	created_utc	score	datetime	tsla_price	amzn_price	pfe_price	gme_price	date
0	HowlLaika	Its not you WHOLE savings. You still got 16 th...	1.639492e+09	1.0	2021-12-14	951.755005	3366.415039	55.155	139.345001	2021-12-14
1	lulu_butts	Screen shot that and complain to customer service	1.639492e+09	1.0	2021-12-14	951.755005	3366.415039	55.155	139.345001	2021-12-14
2	Fuji-one	I need to sell my car to buy more calls	1.639492e+09	1.0	2021-12-14	951.755005	3366.415039	55.155	139.345001	2021-12-14
3	pajamental805	Rip	1.639492e+09	1.0	2021-12-14	951.755005	3366.415039	55.155	139.345001	2021-12-14
4	AskMeAboutL00M	don't listen to all these people who tell you ...	1.639492e+09	1.0	2021-12-14	951.755005	3366.415039	55.155	139.345001	2021-12-14

Predicting Stock Prices with Reddit Comments¶

Author: Katie Kemp¶

Motivation¶

Data Collection¶

Obtaining Data from Reddit¶

Obtaining Stock Market Data¶

Data Processing¶

Exploratory Analysis and Data Visualization¶

Tesla¶

Analysis, Hypothesis Testing, and Machine Learning¶

Tesla¶

Amazon¶

Pfizer¶

Gamestop¶

Insights¶