-
Notifications
You must be signed in to change notification settings - Fork 1
/
ecen350_reference_sheet.tex
404 lines (378 loc) · 17.5 KB
/
ecen350_reference_sheet.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
% Josh_Wright_Resume.tex
% (c) Copyright 2015 Josh Wright
\documentclass[12pt]{article}
\usepackage{verbatim}
% \usepackage{syntonly}
\usepackage{ragged2e}
\usepackage{geometry}
\usepackage{enumitem} % for longenum
\usepackage{setspace}
\usepackage{hyperref}
\usepackage{tabularx}
\usepackage{graphicx}
\usepackage{outlines} % for outline
\usepackage{paralist} % for compactitem (compact itemize)
\usepackage{multicol} % for multicolumn layout
\geometry{letterpaper, margin=0.4in, top=0.2in}
% \geometry{letterpaper, margin=0.5in, top=0.35in, left=1.5in}
\pagenumbering{gobble}
\begin{document}
\begin{spacing}{0.8}
%%%%%%%%%%%%%%%%%%
%% main section %%
%%%%%%%%%%%%%%%%%%
\begin{multicols*}{2}
\begin{flushleft}
\newlist{longenum}{itemize}{5}
\setlist[longenum,1]{nosep,leftmargin=0.2cm,labelwidth=0px,align=left,label=$\bullet$}
\setlist[longenum,2]{nosep,leftmargin=0.2cm,labelwidth=0px,align=left,label=$\ast$}
\setlist[longenum,3]{nosep,leftmargin=0.2cm,labelwidth=0px,align=left,label=-}
\setlist[longenum,4]{nosep,leftmargin=0.2cm,labelwidth=0px,align=left,label=>}
\setlist[longenum,5]{nosep,leftmargin=0.2cm,labelwidth=0px,align=left,label=@}
% \begin{outline}[compactitem]
\begin{outline}[longenum]
%%%%%%%%%%%%%%%%%%%%
%% spacing config %%
%%%%%%%%%%%%%%%%%%%%
% just in case I need even more space
\newlength{\upspacelength}
\setlength{\upspacelength}{0px}
\newcommand{\upspace}{\vspace{\upspacelength}}
% section titles
\newcommand{\zzz}[1]{\upspace \0 \textbf{#1} }
% \newcommand{\zzz}[1]{\0 \hspace{-1.25in} \textbf{#1} \vspace{-10px} }
% makes second-level itemize bullets instead of dashes
% \renewcommand\labelitemii{\labelitemi}
% redefine the sub-headings to inject our space-saver
\let\oldOne\1\let\oldTwo\2\let\oldThree\3\let\oldFour\4
\renewcommand{\1}{\oldOne \hspace{-8px}}
\renewcommand{\2}{\oldTwo \hspace{-8px}}
\renewcommand{\3}{\oldThree \hspace{-8px}}
\renewcommand{\4}{\oldFour \hspace{-8px}}
\small
\noindent ECEN350 Ref Sheet \hfill \textcopyright \space Josh Wright \today
% \zzz{Hex}
% \\
% \begin{tabular}{|r r r|}\hline
% 0 & \verb|0x0| & \verb|0000| \\
% 1 & \verb|0x1| & \verb|0001| \\
% 2 & \verb|0x2| & \verb|0010| \\
% 4 & \verb|0x4| & \verb|0100| \\
% 8 & \verb|0x8| & \verb|1000| \\ \hline
% \end{tabular}
\zzz{Registers}
\\
\begin{tabular}{|l r l l|}\hline
n & $_{10}$ & hex & bin \\\hline
\verb|$0 | & \verb| 0| & \verb|0x00| & \verb|00000| \\ \hline
\verb|$at| & \verb| 1| & \verb|0x01| & \verb|00001| \\ \hline \hline
\verb|$v0| & \verb| 2| & \verb|0x02| & \verb|00010| \\ \hline
\verb|$v1| & \verb| 3| & \verb|0x03| & \verb|00011| \\ \hline \hline
\verb|$a0| & \verb| 4| & \verb|0x04| & \verb|00100| \\ \hline
\verb|$a1| & \verb| 5| & \verb|0x05| & \verb|00101| \\ \hline
\verb|$a2| & \verb| 6| & \verb|0x06| & \verb|00110| \\ \hline
\verb|$a3| & \verb| 7| & \verb|0x07| & \verb|00111| \\ \hline \hline
\verb|$t0| & \verb| 8| & \verb|0x08| & \verb|01000| \\ \hline
\verb|$t1| & \verb| 9| & \verb|0x09| & \verb|01001| \\ \hline
\verb|$t2| & \verb|10| & \verb|0x0a| & \verb|01010| \\ \hline
\verb|$t3| & \verb|11| & \verb|0x0b| & \verb|01011| \\ \hline
\verb|$t4| & \verb|12| & \verb|0x0c| & \verb|01100| \\ \hline
\verb|$t5| & \verb|13| & \verb|0x0d| & \verb|01101| \\ \hline
\verb|$t6| & \verb|14| & \verb|0x0e| & \verb|01110| \\ \hline
\verb|$t7| & \verb|15| & \verb|0x0f| & \verb|01111| \\ \hline
\end{tabular}
\begin{tabular}{|l r l l|}\hline
\verb|$s0| & \verb|16| & \verb|0x10| & \verb|10000| \\ \hline
\verb|$s1| & \verb|17| & \verb|0x11| & \verb|10001| \\ \hline
\verb|$s2| & \verb|18| & \verb|0x12| & \verb|10010| \\ \hline
\verb|$s3| & \verb|19| & \verb|0x13| & \verb|10011| \\ \hline
\verb|$s4| & \verb|20| & \verb|0x14| & \verb|10100| \\ \hline
\verb|$s5| & \verb|21| & \verb|0x15| & \verb|10101| \\ \hline
\verb|$s6| & \verb|22| & \verb|0x16| & \verb|10110| \\ \hline
\verb|$s7| & \verb|23| & \verb|0x17| & \verb|10111| \\ \hline \hline
\verb|$t8| & \verb|24| & \verb|0x18| & \verb|11000| \\ \hline
\verb|$t9| & \verb|25| & \verb|0x19| & \verb|11001| \\ \hline \hline
\verb|$k0| & \verb|26| & \verb|0x1a| & \verb|11010| \\ \hline
\verb|$k1| & \verb|27| & \verb|0x1b| & \verb|11011| \\ \hline
\verb|$gp| & \verb|28| & \verb|0x1c| & \verb|11100| \\ \hline
\verb|$sp| & \verb|29| & \verb|0x1d| & \verb|11101| \\ \hline
\verb|$fp| & \verb|30| & \verb|0x1e| & \verb|11110| \\ \hline
\verb|$ra| & \verb|31| & \verb|0x1f| & \verb|11111| \\ \hline
\end{tabular}
\1 callee saved registers: \verb|$s0-$s7, $sp, $gp, $fp|
\2 save parent's value at beginning of function
\1 caller saved registers: basically all the others
\2 save your value before calling subroutine
\1 general format is to list destination first, then operands
\zzz{Clock Rate}
\\
\begin{tabular}{|r r|} \hline
period & rate \\ \hline
1 msec & 1 MHz \\ \hline
100 nsec & 10 MHz \\ \hline
% 5 nsec & 200 MHz \\ \hline
\end{tabular}
\begin{tabular}{|r r|} \hline
10 nsec & 100 MHz \\ \hline
% 2 nsec & 500 MHz \\ \hline
1 nsec & 1 GHz \\ \hline
% 500 psec & 2 GHz \\ \hline
% 250 psec & 4 GHz \\ \hline
% 200 psec & 5 GHz \\ \hline
100 psec & 10 GHz \\ \hline
\end{tabular}
\zzz{Metric Prefixes} \\
\begin{tabular}{|c c l l|} \hline
peta & P & $10^{ 15}$ & \hfill 1 000 000 000 000 000 \\ \hline
tera & T & $10^{ 12}$ & \hfill 1 000 000 000 000 \\ \hline
giga & G & $10^{ 9}$ & \hfill 1 000 000 000 \\ \hline
mega & M & $10^{ 6}$ & \hfill 1 000 000 \\ \hline
kilo & k & $10^{ 3}$ & \hfill 1 000 \\ \hline
hecto & h & $10^{ 2}$ & \hfill 100 \\ \hline
deca & da & $10^{ 1}$ & \hfill 10 \\ \hline
one & & $10^{ 0 }$ & \hfill 1 \hfill \hfill \\ \hline
deci & d & $10^{- 1}$ & 0.1 \\ \hline
centi & c & $10^{- 2}$ & 0.01 \\ \hline
milli & m & $10^{- 3}$ & 0.001 \\ \hline
micro & $\mu$ & $10^{- 6}$ & 0.000 001 \\ \hline
nano & n & $10^{- 9}$ & 0.000 000 001 \\ \hline
pico & p & $10^{-12}$ & 0.000 000 000 001 \\ \hline
femto & f & $10^{-15}$ & 0.000 000 000 000 001 \\ \hline
\end{tabular}
% \zzz{J format (absolute branching)}
% \1 cannot change the top 4 bits of PC. (\verb|PC[31:28]|)
% \1 range:
% \2 total of $2^{26}$ instructions or $2^{28}$ bytes
% \3 because range is $[0,2^{26}-1]$
% \2 farthest possible next instruction is $2^{26}$ away (if \verb|PC+4| lies at the beginning of a $2^{28}$ byte boundary)
% \2 worst case is you can only jump 1 instruction ahead (if \verb|PC+4| lies at the end of a $2^{28}$ byte boundary)
% \1 conversion:
% \2 instruction stores 26 bits
% \2 right pad with two 0s to get 28
% \2 take the top four bits from current PC to get 32
% \1 mask of top 4 bits: \verb|0xF0000000|
% \1 \verb|target = (PC AND 0xF0000000) OR (addr << 2)|
% % python: j = lambda PC,addr: hex((PC & 0xF0000000) | (addr << 2))
% \zzz{Relative Branching}
% \1 range: $[ PC - 2^{17}, PC + 2^{17} - 4 ]$
% \2 that's in bytes. It's a range of $2^{15}-1$ words
% \2 you lose one from the exponent because it's 2's complement
% \1 conversion
% \2 take 16 bit offset, zero pad by 2 (multiply by 4)
% \2 add to PC+4 (next PC)
% \1 \verb|target = (PC + 4) + (addr << 2)|
% \1 due to the \verb|PC+4| thing, if you want to jump back to the same instruction, the immediate value will be -1
\zzz{Endianness}
\\
Value: \verb|0xA0B0C0D0|\\
\1
\begin{tabular}{l r r r r}
index & 0 & 1 & 2 & 3 \\
little & \verb|0xD0| & \verb|0xC0| & \verb|0xB0| & \verb|0xA0| \\
big & \verb|0xA0| & \verb|0xB0| & \verb|0xC0| & \verb|0xD0| \\
\end{tabular}
\2 Little Endian puts the least significant (littlest) stuff first
\1 x86 is little endian, MIPS is big endian
\1 networking is done in big endian
\zzz{Two's Complement}
\1 $N$ bits can represent a range $[ -2^N, +2^N - 1 ]$
\1 methods for converting negative values
\1 method 1:
\2 start with absolute value
\2 flip all bits (bitwise not)
\2 add 1
\1 method 2:
\2 use $N+1$ bits ($2^N$ is $N+1$ bits)
\2 start with absolute value $x$
\2 find $2^N - x$
\2 truncate
\zzz{Shifts}
\1 shift left always fills with 0s
\1 \textbf{Logical} left shift fills with 0s
\1 \textbf{Arithmetic} left shift sign-extends
\2 extends based on far left bit (most significant)
\zzz{Assembler}
\1 Spilling: when a compiler puts a variable in main memory because it's run out of registers
\2 the variable has spilled to RAM
\2 inverse is filling
\1 Object file sections: header; text; data; relocation information; symbol table; debugging information
\2 Object file is assembled assuming that instructions start at \verb|0x00|. (this is corrected later by the linker)
\1 Global label can be referenced in any file
\2 you must declare it global in the file where it is defined, and declare it global again where it's used
\2 \verb|main| must be global so the linker can find it
\2 \verb|printf| is global so you can use it (but you must still declare it as global in that file where you use it)
\1 local label can be referenced in only the current file
\2 labels are local by default
\1 \textbf{Symbol Table:} contains all external references
\2 also lists unresolved references (e.g. printf)
\2 as far as assembler is concerned, symbol table contains both local and global labels, resolved and unresolved.
\2 The final assembled object file only contains global labels
\1 \textbf{Relocation Table:} contains references to all things that depend on absolute addresses
\2 e.g. all absolute jumps, load address
\2 these must be changed after loading into memory
\2 does not contain addresses of labels
%%%%%%%%%%%%%%
%%% exam 2 %%%
%%%%%%%%%%%%%%
% \zzz{Verilog}
% \1 always block: synthesize to combinational logic iff:
% \2 everything written to is always written exactly once for every case of inputs
% \2 the outputs of the always block depend only on inputs that are in the sensitivity list
% \2 stuff assigned to inside an always block must be declard \verb|reg|
% \3 will be optimized out if it's combinational
% \1 bitwise not is $\sim$
% \1 ternary operator: \verb|cond ? if_true : if_false|
% \1 assignments: \verb|=| is blocking, \verb|<=| is non-blocking
% \2 \verb|=|: happens in order
% \2 \verb|<=|: happens all at once
% \1 case statement: can use \verb|?| to specify `don't care' for some bits
% \1 \verb|`timescale unit/precision|:
% \2 \verb|unit|: 1, 10, or 100, unit either s, ms, us, ps, fs
% \2 \verb|precision|: must be shorter than \verb|unit|
\zzz{State Machine}
\1 outputs determined by:
\1 Mealy Machine: current state and current inputs
\1 Moore Machine: current state only
\zzz{Performance}
\1 execution time = (\# of clock cycles) $\times$ (clock cycle time)
= (\# of clock cycles)/(clock rate)
\1 CPI: Cycles Per Instruction
\2 effective CPI is just a weighted average (varies by instruction mix)
\1 instructions per time = CPI / clock rate = CPI * clock period
\1 compare two systems:
\2 use instruction latencies and instruction mix to calculate CPI for each setup
\2 then calculate instructions per time, and do comparison there
\zzz{IEEE Floating-Point}
\1 1 bit sign; 8 bit exponent; 23 bit mantissa
\2 $x = (-1)^s \cdot (1.m) \cdot 2^{e-127}$
\1 sign: 0 for positive, 1 for negative
\1 exponent: bias is $-127$
\1 mantissa: the fractional part; denominator $2^{23}$
\2 implicit leftmost bit is not stored, only fractional
\1 conversion: decimal to float:
\2 start with $x$
\2 use $\lfloor \log_2 \rfloor$ to express $x$ as $a \cdot 2^b$ where $1\leq a < 2$
\2 exponent $=127+b$
\2 mantissa $=(a-1) \cdot 2^{23}$
\3 round to nearest integer
\1 conversion: float to decimal:
\2 real exponent $a = exp-127$
\2 take exponent as integer $\rightarrow a$
\2 decimal = $(1 + \frac{a}{2^{23}}) \cdot 2^a$
\1 calculate mantissa directly: $\frac{x}{2^{\lfloor \log_2(x) \rfloor}} \cdot 2^{23}$
\1 mantissa the long way:
\\ take right-of-decimal part and repeatedly multiply by 2. On each iteration, the 1's place is that bit in the mantissa. (starting from leftmost bit)
\1 quantity of numbers on range $[2^n, 2^{n+1}] = 2^{23} + 1$
\\ quantity of numbers on range $[2^n, 2^{n+1}) = 2^{23}$
\2 the $2^{n+1}$ bumps it up because the exponent changes
\1 next largest float: add $2^{-23}$ to mantissa (assuming exponent doesn't change, i.e. number isn't evenly $2^n$)
\1
\begin{tabular}{|l l l|} \hline
exponent & mantissa & meaning \\ \hline
0 & 0 & $\pm$zero \\ \hline
0 & $\not=0$ & denormalized \\ \hline
1-254 & any & normal \\ \hline
255 & 0 & $\pm\infty$ \\ \hline
255 & $\not=0$ & NaN \\ \hline
\end{tabular}
\1
\begin{tabular}{|l r r|} \hline
$ $ & float & double \\ \hline
sign & 1 bit & 1 bit \\ \hline
exponent & 8 bits & 11 bits \\ \hline
exp bias & 127 & 1023 \\ \hline
exp min & -126 & -1022 \\ \hline
exp max & +127 & +1023 \\ \hline
mantissa & 23 bits & 52 bits \\ \hline
% max &
% min &
\end{tabular}
\1 minimum integer that can't be exactly represented: $2^{24} + 1 = $ 16 777 217
% TODO: minimum and maximum of float and double?
%%%%%%%%%%%%%%%%%%
%%% Final Exam %%%
%%%%%%%%%%%%%%%%%%
\zzz{Pipelining}
\1 5 stages: IF, ID, EX, MEM, WB
\1 branch delay stalls when decision made in stage:
\2 MEM: 3 stalls; EX: 2; ID: 1
\1 if a branch is predicted wrong, you must flush the pipeline
\1 ALU data hazards:
\2 w/o forwarding: 2 stalls if the next instruction is dependent
\2 w/forwarding: no delays ever
\1 load-use hazard:
\2 w/o forwarding: 2 delays
\2 w/forwarding: 1 delay for next instruction, then forward from data memory
\1 ideal speedup: instruction time pipelined pipelined $=$ instruction time before / number of stages
\zzz{Branch Prediction}
\1 if predict wrong, flush the pipeline after the branch
\2 set all control and opcode to 0 (\verb|nop|)
\1 static
\1 1-bit: keep a cache of the PC of the branch and last taken/not taken
\1 2-bit: keep a cache of the PC of the branch and state in a state machine
\2 4 states: strong taken, taken, not taken, strong not taken
(ST,T,N,SN)
\2 taken moves state toward ST, not taken moves toward SN
\2 allows otherwise consistent branches to have some variation
\zzz{Cache}
\1 sources of cache miss:
\2 compulsory: when the cache's valid bit is $0$
\3 cannot be avoided because cache must start empty
\2 conflict: the wrong data is there
\3 solve by increasing cache size or associativity
\2 capacity: when all blocks are full and data isn't there
\3 really only happens with fully associative, otherwise it would be conflict
\3 solve by increasing cache size
\1 locality types:
\2 spacial: accessing data that is close to other data
\2 temporal: accessing data that was recently accessed before
\1 fully associative
\2 any data can go anywhere
\2 tag is full address of data
\2 must search entire cache to find anything
\1 directly mapped (1-way associative)
\2 use some low-order bits of the memory address to decide where to put the data in the cache (the index)
\2 means that the tag only needs to be only those bits that aren't in the index (saves space)
\2 also means that other unrelated data can collide with the existing data by simply having the same low-order bits
\1 $n$-way associative
\2 same as directly-mapped except there's $n$ total banks of directly mapped cache to check
\2 (if it's not in one, it might be in another)
\2 you want to replace stuff in LRU order: Least Recently Used
\3 thus every cache line must have a counter that is incremented on every cache read
\1 multi-block
\2 combined with above concepts
\2 store multiple adjacent words from memory together in the cache
\2 whenever you would fetch just one word in a block, fetch the whole block
\2 for each word, you have a block offset and an index
\2 tag is highest order bits of address, then line index, then block index, (then byte offset)
\1 write handling:
\2 naive solution: stall (not optimal)
\2 write allocate (writeback)
\3 write into the cache and set dirty bit
\3 then write to memory when that data is evicted by other data
\2 no-write allocate
\3 set dirty bit, but write to write buffer instead
\3 don't need to stall on writes
\3 must stall for reads of data that was written to the buffer
(stall until buffer is flushed)
\1 CPUtime = IC $*$ CPI $*$ CC
\2 IC: instruction count
\2 CC: clock cycles
\1 CPUtime = IC $*$ (CPI-ideal + Memory-stall cycles ) $*$ CC
\1 write-buffering:
\2 read-stall cycles = reads/program $*$ read miss rate $*$ read miss penalty
\2 write-stall cycles = writes/program $*$ write miss rate $*$ write miss penalty + write buffer stalls
\1 write-back: memory-stall = miss rate $*$ miss penalty
\zzz{Exceptions}
\1 procedure:
\2 set offending instruction and all following to \verb|nop|
(flush pipeline)
\3 but keep instructions that came before offending
\2 set cause and EPC register values
\2 load (hard-coded?) PC of correct exception handler
\2 carry on (in exception handler)
\end{outline}
\end{flushleft}
\end{multicols*}
\end{spacing}
\end{document}