Chuanshuoge: quantopian lecture Pandas

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulate 100 samples for each of 10 stocks: returns are drawn from a
# normal distribution with mean 1 and standard deviation 0.03.
returns = pd.DataFrame(np.random.normal(loc=1, scale=0.03, size=(100, 10)))
# The cumulative product of the returns down each column gives a price path.
prices = returns.cumprod()
prices.plot()
plt.xlabel('time')
plt.ylabel('price')
plt.title('randomly generated prices')
plt.legend(loc=0)

# A small Series with one missing value; the NaN makes the dtype float64.
s = pd.Series([1, 2, np.nan, 4, 5])
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(s)
print(s.index)

0 1.0
1 2.0
2 NaN
3 4.0
4 5.0
dtype: float64
RangeIndex(start=0, stop=5, step=1)

# Replace the default integer index with five consecutive calendar days.
new_index = pd.date_range('2019-01-01', periods=5, freq='D')
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(new_index)
s.index = new_index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05'],
dtype='datetime64[ns]', freq='D')

# Positional (integer-location) access with iloc.
s.iloc[0]  # first element
s.iloc[len(s)-1]  # last element
s.iloc[:2]  # first two elements
s.iloc[0: len(s): 1]  # explicit start:stop:step slice covering the whole series

2019-01-01 1.0
2019-01-02 2.0
2019-01-03 NaN
2019-01-04 4.0
2019-01-05 5.0
Freq: D, dtype: float64

# A step of -1 walks the series from the last element back to the first.
s.iloc[::-1]

2019-01-05 5.0
2019-01-04 4.0
2019-01-03 NaN
2019-01-02 2.0
2019-01-01 1.0
Freq: -1D, dtype: float64

# Label-based slicing with loc; both endpoint labels are included in the result.
s.loc['2019-01-02':'2019-01-04']

2019-01-02 2.0
2019-01-03 NaN
2019-01-04 4.0
Freq: D, dtype: float64

# Boolean indexing: comparing a Series with a scalar gives an element-wise
# boolean Series (the NaN entry compares as False).
print(s < 3)

2019-01-01 True
2019-01-02 True
2019-01-03 False
2019-01-04 False
2019-01-05 False
Freq: D, dtype: bool

# Passing the boolean Series to loc keeps only the rows where it is True.
print(s.loc[s < 3])

2019-01-01 1.0
2019-01-02 2.0
Freq: D, dtype: float64

# Combine conditions with & (element-wise AND); each comparison needs its own
# parentheses because & binds more tightly than < and >.
print(s.loc[(s < 3) & (s > 1)])

2019-01-02 2.0
Freq: D, dtype: float64

symbol = "CMG"
start = "2012-01-01"
end = "2016-01-01"
# NOTE(review): get_pricing is not defined or imported in this file; presumably
# it is the built-in supplied by the Quantopian research environment.
prices = get_pricing(symbol, start_date=start, end_date=end, fields="price")

# Single-argument print(...) emits the same text on Python 2 and Python 3:
# a blank line followed by the type of the returned object.
print("\n" + str(type(prices)))
prices.head(5)

<class 'pandas.core.series.Series'>
2012-01-03 00:00:00+00:00 340.980
2012-01-04 00:00:00+00:00 348.740
2012-01-05 00:00:00+00:00 349.990
2012-01-06 00:00:00+00:00 348.950
2012-01-09 00:00:00+00:00 339.522
Freq: C, Name: Equity(28016 [CMG]), dtype: float64

# Downsample to monthly frequency, taking the mean price within each month.
monthly_prices = prices.resample(rule='M').mean()
monthly_prices.head(n=10)

2012-01-31 00:00:00+00:00 354.812100
2012-02-29 00:00:00+00:00 379.582000
2012-03-31 00:00:00+00:00 406.996182
2012-04-30 00:00:00+00:00 422.818500
2012-05-31 00:00:00+00:00 405.811091
2012-06-30 00:00:00+00:00 403.068571
2012-07-31 00:00:00+00:00 353.849619
2012-08-31 00:00:00+00:00 294.516522
2012-09-30 00:00:00+00:00 326.566316
2012-10-31 00:00:00+00:00 276.545333
Freq: M, Name: Equity(28016 [CMG]), dtype: float64

# Same monthly downsampling, but aggregating each month with the median.
monthly_prices_med = prices.resample(rule='M').median()
monthly_prices_med.head(n=10)

2012-01-31 00:00:00+00:00 355.380
2012-02-29 00:00:00+00:00 378.295
2012-03-31 00:00:00+00:00 408.850
2012-04-30 00:00:00+00:00 420.900
2012-05-31 00:00:00+00:00 405.390
2012-06-30 00:00:00+00:00 402.790
2012-07-31 00:00:00+00:00 380.370
2012-08-31 00:00:00+00:00 295.380
2012-09-30 00:00:00+00:00 332.990
2012-10-31 00:00:00+00:00 286.440
Freq: M, Name: Equity(28016 [CMG]), dtype: float64

def custom_resampler(array_like):
    """Return the first value of the period.

    Intended as the aggregation function for ``resample(...).apply(...)``.

    Args:
        array_like: The values that fall inside one resampling period
            (a pandas Series, a NumPy array, or any indexable sequence).

    Returns:
        The first element by position, or NaN when the period is empty.
    """
    if len(array_like) == 0:
        # An empty period has no first value; NaN marks it as missing
        # instead of raising IndexError in the middle of a resample.
        return np.nan
    # A Series with a non-integer index must be indexed by position through
    # .iloc; plain sequences and arrays are already positional.
    positional = getattr(array_like, "iloc", array_like)
    return positional[0]

# Monthly downsampling with a user-supplied aggregation: custom_resampler is
# called once per month with that month's prices.
first_of_month_prices = prices.resample(rule='M').apply(custom_resampler)
first_of_month_prices.head(n=10)

2012-01-31 00:00:00+00:00 340.98
2012-02-29 00:00:00+00:00 370.84
2012-03-31 00:00:00+00:00 394.58
2012-04-30 00:00:00+00:00 418.65
2012-05-31 00:00:00+00:00 419.78
2012-06-30 00:00:00+00:00 397.14
2012-07-31 00:00:00+00:00 382.97
2012-08-31 00:00:00+00:00 280.60
2012-09-30 00:00:00+00:00 285.91
2012-10-31 00:00:00+00:00 316.13
Freq: M, Name: Equity(28016 [CMG]), dtype: float64

#Missing Data
# Reindex the prices onto every calendar day between start and end so that
# days without a price appear as NaN. The new index reuses the timezone of
# the price index so that the labels line up.
calendar_dates = pd.date_range(start=start, end=end, freq='D', tz=prices.index.tz)
calendar_prices = prices.reindex(calendar_dates)

#fill in the missing days with the mean price of all days.
meanfilled_prices = calendar_prices.fillna(calendar_prices.mean())

#fill in the missing days with the next known value.
# .bfill() is the non-deprecated spelling of fillna(method='bfill').
bfilled_prices = calendar_prices.bfill()

#drop missing data
dropped_prices = calendar_prices.dropna()

prices.plot();
# We still need to add the axis labels and title ourselves
plt.title(symbol + " Prices")
plt.ylabel("Price")
plt.xlabel("Date");

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Summary Statistics")
# describe() reports count, mean, std, min, quartiles and max of the series.
print(prices.describe())

Summary Statistics
count 1006.000000
mean 501.637439
std 146.697204
min 236.240000
25% 371.605000
50% 521.280000
75% 646.753750
max 757.770000
Name: Equity(28016 [CMG]), dtype: float64

# Inject noise: add 5x a normal(mean 0, std 5) draw per observation, aligned
# on the price index, plus a constant offset of 20.
noisy_prices = (
    prices
    + 5 * pd.Series(np.random.normal(loc=0, scale=5, size=len(prices)), index=prices.index)
    + 20
)
noisy_prices.plot();
plt.title(symbol + " Prices")
plt.xlabel("Date")
plt.ylabel("Price");

# First-order difference: each price minus the previous one. The first
# element has no predecessor, so it is sliced off.
add_returns = prices.diff().iloc[1:]

add_returns.plot();
# The title and axis labels still have to be set by hand.
plt.title(symbol + " Prices 1st order diff")
plt.xlabel("Date")
plt.ylabel("Price diff");

# Percent change (multiplicative returns): the relative change from one price
# to the next. The first element has no predecessor, so it is sliced off.
percent_change = prices.pct_change().iloc[1:]

plt.title("Multiplicative returns of " + symbol)
plt.ylabel("Percent Returns")
plt.xlabel("Date")
percent_change.plot();

# Prices and multiplicative returns stacked in one figure: a 6-row grid where
# the price panel takes the top five rows and the returns panel the last row,
# with both panels sharing the same x axis.
ax1 = plt.subplot2grid((6, 1), (0, 0), rowspan=5)
ax2 = plt.subplot2grid((6, 1), (5, 0), sharex=ax1)

ax1.plot(prices.index, prices)
ax2.plot(percent_change.index, percent_change)

#30 day rolling mean
# Each point is the mean of the current and previous 29 observations; the
# window is right-aligned (center=False).
rolling_mean = prices.rolling(window=30,center=False).mean()
# The series name is what the legend shows for this line.
rolling_mean.name = "30-day rolling mean"

prices.plot()
rolling_mean.plot()
# Leading space keeps the symbol and the word apart ("CMG Price").
plt.title(symbol + " Price")
plt.xlabel("Date")
plt.ylabel("Price")
plt.legend();

Chuanshuoge

Monday, 18 November 2019

quantopian lecture Pandas

No comments:

Post a Comment

Chuanshuoge, Calgary, Canada, Earth, Solar system, Milky Way Galaxy

_ChuanShuo^Ge - _传说^哥