|
| 1 | +#!/usr/bin/env python |
| 2 | +# coding: utf-8 |
| 3 | + |
| 4 | + |
| 5 | + |
| 6 | +#scrape cftc trader commitment report |
| 7 | + |
| 8 | + |
| 9 | +# In[1]: |
| 10 | + |
| 11 | + |
| 12 | +import requests |
| 13 | +import pandas as pd |
| 14 | +import re |
| 15 | +import os |
# Module-level side effect: every relative path used afterwards (including the
# CSV file name written by main) resolves against this directory.
# NOTE(review): 'H:/' looks like a machine-specific drive path -- confirm it
# exists wherever this script is run.
os.chdir('H:/')
| 17 | + |
| 18 | + |
| 19 | +# In[2]: |
| 20 | + |
| 21 | + |
| 22 | +#scraping function |
def scrape(url, timeout=30):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed straight to ``requests`` as the request timeout (seconds), so
        a stalled server cannot block the script indefinitely.

    Returns
    -------
    requests.Response
        The response exactly as returned by ``Session.get``. The HTTP status
        is not checked here; callers decide how to treat error responses.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }

    # The context manager closes the session's pooled connections on exit.
    # The request is not streamed, so the body has already been read and
    # response.content stays usable after the session is closed.
    with requests.Session() as session:
        session.headers.update(headers)
        return session.get(url, timeout=timeout)
| 33 | + |
| 34 | + |
| 35 | +# In[3]: |
| 36 | + |
| 37 | + |
| 38 | +#get data |
# Text that identifies the header line of each report block.
_EXCHANGE_MARKER = 'CHICAGO MERCANTILE EXCHANGE'

# Trader type + side, in the order the figures are read from each report line.
_POSITION_FIELDS = (
    'non_commercial_long', 'non_commercial_short', 'non_commercial_spread',
    'commercial_long', 'commercial_short',
    'total_long', 'total_short',
    'non_reportable_long', 'non_reportable_short',
)

# Only the first seven fields are read from the trader-count line.
_TRADER_FIELDS = _POSITION_FIELDS[:7]

# Columns that describe the whole block and are kept as identifiers in melt().
_ID_COLUMNS = [
    'commodity', 'commodity_code', 'change_date', 'date',
    'contract_unit', 'open_interest', 'change_open_interest', 'total_traders',
]

# Wide-format column order; must match the value order built by _parse_block.
_COLUMN_NAMES = (
    ['commodity', 'commodity_code', 'date', 'contract_unit', 'open_interest']
    + [field + '_commitment' for field in _POSITION_FIELDS]
    + ['change_date', 'change_open_interest']
    + [field + '_change' for field in _POSITION_FIELDS]
    + [field + '_percent' for field in _POSITION_FIELDS]
    + ['total_traders']
    + [field + '_traders' for field in _TRADER_FIELDS]
)

# Raw strings keep the backslashes for the regex engine instead of relying on
# Python passing unknown string escapes through.
_DATE_RE = re.compile(r'\d{2}/\d{2}/\d{2}')
_UNIT_RE = re.compile(r'(?<=\().*(?=OPEN INTEREST)')
_OPEN_INTEREST_RE = re.compile(r'(?<=OPEN INTEREST:).*')

# Long labels for two of the measure names; looked up by exact name so the
# word 'traders' inside the percent label is never rewritten a second time.
_VARIABLE_LABELS = {
    'percent': 'percent_of_open_interest_for_each_type_of_traders',
    'traders': 'number_of_traders_in_each_type',
}


def _figures(line, expected):
    """Return the whitespace-separated tokens of *line*; ValueError unless there are *expected*."""
    tokens = re.findall(r'\S+', line)
    if len(tokens) != expected:
        raise ValueError('expected %d figures, found %d: %r'
                         % (expected, len(tokens), line))
    return tokens


def _parse_block(text, start):
    """Read one report block whose header is text[start]; values follow _COLUMN_NAMES order."""
    row = [
        text[start].split(' - ' + _EXCHANGE_MARKER)[0].replace('\n', ''),
        text[start].split('Code-')[-1].replace('\n', ''),
        _DATE_RE.search(text[start + 1]).group(),
        _UNIT_RE.search(text[start + 7]).group().replace(')', ''),
        _OPEN_INTEREST_RE.search(text[start + 7]).group(),
    ]
    row += _figures(text[start + 9], len(_POSITION_FIELDS))
    row.append(_DATE_RE.search(text[start + 11]).group())
    row.append(text[start + 11].split(' ')[-1].replace(')', ''))
    row += _figures(text[start + 12], len(_POSITION_FIELDS))
    row += _figures(text[start + 15], len(_POSITION_FIELDS))
    row.append(text[start + 17].split(' ')[-1].replace(')', ''))
    row += _figures(text[start + 18], len(_TRADER_FIELDS))
    return row


def _describe(column):
    """Split e.g. 'non_commercial_long_percent' into (type, position, variable label)."""
    trader_type, position, measure = column.rsplit('_', 2)
    return trader_type, position, _VARIABLE_LABELS.get(measure, measure)


def etl(response):
    """Parse the downloaded report into a long-format DataFrame.

    The body of *response* is decoded as UTF-8 and split on carriage returns.
    Every line containing 'CHICAGO MERCANTILE EXCHANGE' starts a block, and
    the figures are read from fixed offsets after that header line:
    +1 report date, +7 contract unit and open interest, +9 commitments,
    +11 change date and change in open interest, +12 changes, +15 percents,
    +17 total traders, +18 trader counts.

    Parameters
    ----------
    response : object
        Anything exposing the raw page as bytes in ``.content``.

    Returns
    -------
    pandas.DataFrame
        One row per block and measure, with the columns commodity,
        commodity_code, change_date, date, contract_unit, open_interest,
        change_open_interest, total_traders, type, position, variable and
        value. All values are kept as the strings found in the report.

    Raises
    ------
    ValueError
        If a figures line does not hold the expected number of tokens.
    AttributeError
        If a date, contract unit or open-interest pattern is not found.
    IndexError
        If a block is cut off before its last expected line.
    """
    text = response.content.decode('utf-8').split('\r')

    # enumerate() gives each header its own position even when two header
    # lines are textually identical (list.index would return the first one).
    starts = [pos for pos, line in enumerate(text) if _EXCHANGE_MARKER in line]

    rows = [_parse_block(text, pos) for pos in starts]
    df = pd.DataFrame(rows, columns=_COLUMN_NAMES)

    # wide -> long: one row per (block, measure column)
    value_columns = [name for name in _COLUMN_NAMES if name not in _ID_COLUMNS]
    df = df.melt(id_vars=_ID_COLUMNS, value_vars=value_columns)

    # break each measure column name into trader type, position and measure
    described = [_describe(name) for name in df['variable']]
    df['type'] = [item[0] for item in described]
    df['position'] = [item[1] for item in described]
    df['variable'] = [item[2] for item in described]

    return df[_ID_COLUMNS + ['type', 'position', 'variable', 'value']]
| 195 | + |
| 196 | + |
| 197 | +# In[4]: |
| 198 | + |
def main():
    """Download the report page, reshape it and write it out as a CSV file.

    The page at the hard-coded CFTC URL is fetched with scrape(), parsed by
    etl(), and the resulting table is saved as 'trader commitment report.csv'
    (without the index column) in the current working directory.
    """
    url = 'https://www.cftc.gov/dea/futures/deacmesf.htm'

    # scrape
    response = scrape(url)

    # etl() parses the response object returned by scrape(); it must be
    # handed that object, not a URL.
    df = etl(response)

    df.to_csv('trader commitment report.csv', index=False)


if __name__ == "__main__":
    main()
| 214 | + |
0 commit comments