Monday, 18 November 2019

Quantopian lecture: pandas

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulate multiplicative returns: 100 samples (rows) for each of 10 stocks
# (columns), drawn from a normal distribution with mean 1 and std 0.03.
returns = pd.DataFrame(np.random.normal(loc=1, scale=0.03, size=(100, 10)))
# The cumulative product of the returns gives one price path per column.
prices = returns.cumprod()
price_axes = prices.plot()
price_axes.set_title('randomly generated prices')
price_axes.set_xlabel('time')
price_axes.set_ylabel('price')
# 'best' is the named form of location code 0.
price_axes.legend(loc='best')


# Build a toy Series; the np.nan entry makes the dtype float64 (see output below).
s = pd.Series([1, 2, np.nan, 4, 5])
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(s)
print(s.index)

0    1.0
1    2.0
2    NaN
3    4.0
4    5.0
dtype: float64
RangeIndex(start=0, stop=5, step=1)

# Swap the default integer index for five consecutive calendar days.
new_index = pd.date_range('2019-01-01', periods = 5, freq='D')
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(new_index)
s.index = new_index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05'],
              dtype='datetime64[ns]', freq='D')

# Positional access with .iloc: the first element, the last element,
# the first two elements, and a full start:stop:step slice.
s.iloc[0]
s.iloc[len(s)-1]
s.iloc[:2]
s.iloc[0: len(s): 1]

2019-01-01    1.0
2019-01-02    2.0
2019-01-03    NaN
2019-01-04    4.0
2019-01-05    5.0
Freq: D, dtype: float64

# A step of -1 walks the Series from the last element back to the first.
s.iloc[::-1]

2019-01-05    5.0
2019-01-04    4.0
2019-01-03    NaN
2019-01-02    2.0
2019-01-01    1.0
Freq: -1D, dtype: float64

# Label-based slice with .loc; the output below shows both endpoints included.
s.loc['2019-01-02':'2019-01-04']

2019-01-02    2.0
2019-01-03    NaN
2019-01-04    4.0
Freq: D, dtype: float64

#Boolean Indexing
# Comparing a Series with a scalar yields a boolean Series on the same index.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(s < 3)

2019-01-01     True
2019-01-02     True
2019-01-03    False
2019-01-04    False
2019-01-05    False
Freq: D, dtype: bool

# Keep only the entries where the boolean mask is True.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(s.loc[s < 3])

2019-01-01    1.0
2019-01-02    2.0
Freq: D, dtype: float64

# Combine masks element-wise with &; each comparison needs its own
# parentheses because & binds tighter than < and >.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(s.loc[(s < 3) & (s > 1)])

2019-01-02    2.0
Freq: D, dtype: float64

# Load the "price" field for CMG between the start and end dates.
# NOTE(review): get_pricing is neither defined nor imported in this file;
# presumably it is the Quantopian research built-in -- confirm.
symbol = "CMG"
start = "2012-01-01"
end = "2016-01-01"
prices = get_pricing(symbol, start_date=start, end_date=end, fields="price")

# Build one string before printing so the output (a blank line, then the
# type) is identical on Python 2 and Python 3; a two-argument print(...)
# would print a tuple on Python 2 and insert a space on Python 3.
print("\n" + str(type(prices)))
prices.head(5)

<class 'pandas.core.series.Series'>
2012-01-03 00:00:00+00:00    340.980
2012-01-04 00:00:00+00:00    348.740
2012-01-05 00:00:00+00:00    349.990
2012-01-06 00:00:00+00:00    348.950
2012-01-09 00:00:00+00:00    339.522
Freq: C, Name: Equity(28016 [CMG]), dtype: float64

# Group the prices into month-end ('M') bins and average each bin.
monthly_bins = prices.resample('M')
monthly_prices = monthly_bins.mean()
monthly_prices.head(10)

2012-01-31 00:00:00+00:00    354.812100
2012-02-29 00:00:00+00:00    379.582000
2012-03-31 00:00:00+00:00    406.996182
2012-04-30 00:00:00+00:00    422.818500
2012-05-31 00:00:00+00:00    405.811091
2012-06-30 00:00:00+00:00    403.068571
2012-07-31 00:00:00+00:00    353.849619
2012-08-31 00:00:00+00:00    294.516522
2012-09-30 00:00:00+00:00    326.566316
2012-10-31 00:00:00+00:00    276.545333
Freq: M, Name: Equity(28016 [CMG]), dtype: float64

# Same month-end ('M') bins as the mean, but take the median of each bin.
monthly_median_bins = prices.resample('M')
monthly_prices_med = monthly_median_bins.median()
monthly_prices_med.head(10)

2012-01-31 00:00:00+00:00    355.380
2012-02-29 00:00:00+00:00    378.295
2012-03-31 00:00:00+00:00    408.850
2012-04-30 00:00:00+00:00    420.900
2012-05-31 00:00:00+00:00    405.390
2012-06-30 00:00:00+00:00    402.790
2012-07-31 00:00:00+00:00    380.370
2012-08-31 00:00:00+00:00    295.380
2012-09-30 00:00:00+00:00    332.990
2012-10-31 00:00:00+00:00    286.440
Freq: M, Name: Equity(28016 [CMG]), dtype: float64

def custom_resampler(array_like):
    """Return the first value of a resampling period.

    Args:
        array_like: The values that fall in one period, in order. Either a
            pandas object (anything exposing ``.iloc``) or a plain sequence.

    Returns:
        The element at position 0.

    Raises:
        IndexError: If ``array_like`` is empty.
    """
    # ``series[0]`` is a label lookup, so it fails (or picks the wrong row)
    # when the index is not the default 0-based one; ``.iloc`` is always
    # positional.
    if hasattr(array_like, "iloc"):
        return array_like.iloc[0]
    return array_like[0]

# Apply custom_resampler to each monthly bin. The output below is labelled
# with month-end dates but holds the first price seen in each month.
first_of_month_prices = prices.resample('M').apply(custom_resampler)
first_of_month_prices.head(10)

2012-01-31 00:00:00+00:00    340.98
2012-02-29 00:00:00+00:00    370.84
2012-03-31 00:00:00+00:00    394.58
2012-04-30 00:00:00+00:00    418.65
2012-05-31 00:00:00+00:00    419.78
2012-06-30 00:00:00+00:00    397.14
2012-07-31 00:00:00+00:00    382.97
2012-08-31 00:00:00+00:00    280.60
2012-09-30 00:00:00+00:00    285.91
2012-10-31 00:00:00+00:00    316.13
Freq: M, Name: Equity(28016 [CMG]), dtype: float64

#Missing Data
# calendar_prices was used below without ever being defined. Reindex the
# prices onto every calendar day in [start, end], reusing the price index's
# time zone so the labels line up; days with no price become NaN.
calendar_dates = pd.date_range(start=start, end=end, freq='D', tz=prices.index.tz)
calendar_prices = prices.reindex(calendar_dates)

#fill in the missing days with the mean price of all days.
meanfilled_prices = calendar_prices.fillna(calendar_prices.mean())

#fill in the missing days with the next known value.
# .bfill() is the same backward fill as fillna(method='bfill'), which newer
# pandas releases deprecate.
bfilled_prices = calendar_prices.bfill()

#drop missing data
dropped_prices = calendar_prices.dropna()

# Draw the price series, then add the title and axis labels by hand on the
# axes it was drawn on.
price_axes = prices.plot()
price_axes.set_title(symbol + " Prices")
price_axes.set_ylabel("Price")
price_axes.set_xlabel("Date");


# print(...) with one argument behaves the same on Python 2 and Python 3.
print("Summary Statistics")
# describe() produces the count/mean/std/min/quartiles/max table shown below.
print(prices.describe())

Summary Statistics
count    1006.000000
mean      501.637439
std       146.697204
min       236.240000
25%       371.605000
50%       521.280000
75%       646.753750
max       757.770000
Name: Equity(28016 [CMG]), dtype: float64

# Inject noise: one normal(0, 5) draw per price, scaled by 5 and shifted up by 20.
noise = pd.Series(np.random.normal(0, 5, len(prices)), index=prices.index)
noisy_prices = prices + 5 * noise + 20
noisy_axes = noisy_prices.plot()
noisy_axes.set_title(symbol + " Prices")
noisy_axes.set_ylabel("Price")
noisy_axes.set_xlabel("Date");


# First-order difference (additive returns): price[t] - price[t-1].
# The first entry has no predecessor, so it is dropped.
price_changes = prices.diff()
add_returns = price_changes.iloc[1:]

diff_axes = add_returns.plot()
# The title and axis labels are added by hand.
diff_axes.set_title(symbol + " Prices 1st order diff")
diff_axes.set_ylabel("Price diff")
diff_axes.set_xlabel("Date");

# Percent change (multiplicative returns); the first entry has no
# predecessor, so it is dropped.
relative_changes = prices.pct_change()
percent_change = relative_changes.iloc[1:]

# Labels are set before the plot call, which stays last as in the original.
plt.xlabel("Date")
plt.ylabel("Percent Returns")
plt.title("Multiplicative returns of " + symbol)
percent_change.plot();


# Prices and multiplicative returns in one figure: a 6-row grid where prices
# take the top five rows and returns take the bottom row, sharing the x axis.
grid_shape = (6, 1)
ax1 = plt.subplot2grid(grid_shape, (0, 0), rowspan=5, colspan=1)
ax2 = plt.subplot2grid(grid_shape, (5, 0), rowspan=1, colspan=1, sharex=ax1)

ax1.plot(prices.index, prices)
ax2.plot(percent_change.index, percent_change)


#30 day rolling mean
# Mean over a trailing window of 30 observations (center=False).
rolling_mean = prices.rolling(window=30,center=False).mean()
# Name the Series so the legend has a readable label for this line.
rolling_mean.name = "30-day rolling mean"

prices.plot()
rolling_mean.plot()
# A space separates the symbol from the word, matching the other plot titles
# (it previously rendered as "CMGPrice").
plt.title(symbol + " Price")
plt.xlabel("Date")
plt.ylabel("Price")
plt.legend();


No comments:

Post a Comment