import pandas as pd
import matplotlib.pyplot as plt
from openpyxl import Workbook
# open and load data
df = pd.read_excel("data/Lemonade.xlsx")
df.head()
| Date | Day | Temperature | Rainfall | Flyers | Price | Sales | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 | Sunday | 27.0 | 2.00 | 15 | 0.3 | 10 | NaN | NaN | NaN |
| 1 | 2017-01-02 | Monday | 28.9 | 1.33 | 15 | 0.3 | 13 | NaN | NaN | NaN |
| 2 | 2017-01-03 | Tuesday | 34.5 | 1.33 | 27 | 0.3 | 15 | NaN | NaN | NaN |
| 3 | 2017-01-04 | Wednesday | 44.1 | 1.05 | 28 | 0.3 | 17 | NaN | NaN | NaN |
| 4 | 2017-01-05 | Thursday | 42.4 | 1.00 | 33 | 0.3 | 18 | NaN | NaN | NaN |
df.columns
Index(['Date', 'Day', 'Temperature', 'Rainfall', 'Flyers', 'Price', 'Sales',
'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9'],
dtype='object')
# drop unused columns
df = df.drop(columns=['Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9'])
df.head()
| Date | Day | Temperature | Rainfall | Flyers | Price | Sales | |
|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 | Sunday | 27.0 | 2.00 | 15 | 0.3 | 10 |
| 1 | 2017-01-02 | Monday | 28.9 | 1.33 | 15 | 0.3 | 13 |
| 2 | 2017-01-03 | Tuesday | 34.5 | 1.33 | 27 | 0.3 | 15 |
| 3 | 2017-01-04 | Wednesday | 44.1 | 1.05 | 28 | 0.3 | 17 |
| 4 | 2017-01-05 | Thursday | 42.4 | 1.00 | 33 | 0.3 | 18 |
# get the min and max sales
print("Min Sales:", df['Sales'].min())
print("Max Sales:", df['Sales'].max())
Min Sales: 7 Max Sales: 43
# get record with the highest sales
df[df.Sales == df.Sales.max()]
| Date | Day | Temperature | Rainfall | Flyers | Price | Sales | |
|---|---|---|---|---|---|---|---|
| 181 | 2017-07-01 | Saturday | 102.9 | 0.47 | 59 | 0.5 | 43 |
| 207 | 2017-07-27 | Thursday | 97.9 | 0.47 | 74 | 0.5 | 43 |
# get record with the lowest sales
df[df.Sales == df.Sales.min()]
| Date | Day | Temperature | Rainfall | Flyers | Price | Sales | |
|---|---|---|---|---|---|---|---|
| 364 | 2017-12-31 | Sunday | 15.1 | 2.5 | 9 | 0.3 | 7 |
# find correlation (relationship) between Temperature and Sales
df['Temperature'].corr(df['Sales'])
0.9898320849779685
# find correlation (relationship) between Rainfall and Sales
df['Rainfall'].corr(df['Sales'])
-0.9092139324101026
# plot Temperature vs Sales
plt.xlabel("Temperature")
plt.ylabel("Sales")
plt.grid()
plt.scatter(df['Temperature'], df['Sales'])
plt.show()
# plot Rainfall vs Sales
plt.xlabel("Rainfall")
plt.ylabel("Sales")
plt.grid()
plt.scatter(df['Rainfall'], df['Sales'])
plt.show()