deepspeedai
diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml
Lines changed: 11 additions & 0 deletions
diff --git a/.github/workflows/nv-torch12-p40.yml b/.github/workflows/nv-torch12-p40.yml
Lines changed: 9 additions & 0 deletions
diff --git a/.github/workflows/nv-torch18-v100.yml b/.github/workflows/nv-torch18-v100.yml
Lines changed: 11 additions & 0 deletions
diff --git a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu
Lines changed: 6 additions & 2 deletions
diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu
Lines changed: 26 additions & 19 deletions
@@ -37,12 +37,23 @@ jobs:
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
           sudo apt-get update
           sudo apt-get install -y libaio-dev
+
+      - name: Install transformers
+        run: |
+          git clone https://github.com/huggingface/transformers
+          cd transformers
+          # if needed switch to the last known good SHA until transformers@master is fixed
+          # git checkout 1cc453d33
+          git rev-parse --short HEAD
+          pip install .
+
       # Runs a set of commands using the runners shell
       - name: Install deepspeed
         run: |
           sudo /opt/conda/bin/pip install .[dev,1bit,autotuning]
           #python -c "from deepspeed.env_report import cli_main; cli_main()"
           ds_report
+
       # Runs a set of commands using the runners shell
       - name: Unit tests
         run: |
 
@@ -32,6 +32,15 @@ jobs:
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
+      - name: Install transformers
+        run: |
+          git clone https://github.com/huggingface/transformers
+          cd transformers
+          # if needed switch to the last known good SHA until transformers@master is fixed
+          # git checkout 1cc453d33
+          git rev-parse --short HEAD
+          pip install .
+
       - name: Install deepspeed
         run: |
           pip install .[dev,autotuning]
 
@@ -32,10 +32,21 @@ jobs:
           pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+      - name: Install transformers
+        run: |
+          git clone https://github.com/huggingface/transformers
+          cd transformers
+          # if needed switch to the last known good SHA until transformers@master is fixed
+          # git checkout 1cc453d33
+          git rev-parse --short HEAD
+          pip install .
+
       - name: Install deepspeed
         run: |
           pip install .[dev,1bit,autotuning,sparse_attn]
           ds_report
+
       - name: Unit tests
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
 
@@ -4,6 +4,7 @@
 #include <cuda_profiler_api.h>
 #endif
 
+namespace cg = cooperative_groups;
 namespace cg = cooperative_groups;
 
 __global__ void apply_rotary_pos_emb(float* mixed_query,
@@ -153,7 +154,9 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query,
     int lane = id & 0x1f;
 
     unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid;
+    unsigned seq_index = head_id % seq_len;
     unsigned offset = head_id * head_size;
+    unsigned k_offset = (seq_index + (head_id / seq_len) * MAX_OUT_TOKES) * head_size;
 
     constexpr unsigned mask[32] = {
         0x1 | 0x1000,     0x2 | 0x2000,     0x4 | 0x4000,     0x8 | 0x8000,     0x10 | 0x10000,
@@ -171,7 +174,7 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query,
             float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim;
             inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id;
             float q = (float)mixed_query[offset + lane];
-            float k = (float)key_layer[offset + lane];
+            float k = (float)key_layer[k_offset + lane];
             float rotary_sign = (lane > (half_dim - 1) ? -1.0 : 1.0);
             float q_rot = (q * rotary_sign);
             float k_rot = (k * rotary_sign);
@@ -183,7 +186,7 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query,
             k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq);
 
             mixed_query[offset + lane] = (__half)q;
-            key_layer[offset + lane] = (__half)k;
+            key_layer[k_offset + lane] = (__half)k;
 
             lane += WARP_SIZE;
         }
@@ -237,6 +240,7 @@ template void launch_apply_rotary_pos_emb<__half>(__half*,
                                                   bool,
                                                   bool,
                                                   cudaStream_t);
+
 /*
 __global__ void apply_rotary_pos_emb(float* mixed_query,
 float* key_layer,
 
@@ -317,12 +317,18 @@ __global__ void gptj_residual_add(float* input,
         float4 out = output_cast[offset];
         float4 res_vec = attn_cast[offset];
         float4 bias_data = bias_cast[offset % intermediate_size];
-        float4 attn_bias = attnbias_cast[offset % intermediate_size];
 
-        data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x + attn_bias.x);
-        data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y + attn_bias.y);
-        data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z + attn_bias.z);
-        data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w + attn_bias.w);
+        if (attnbias) {
+            float4 attn_bias = attnbias_cast[offset % intermediate_size];
+            data.x += attn_bias.x;
+            data.y += attn_bias.y;
+            data.z += attn_bias.z;
+            data.w += attn_bias.w;
+        }
+        data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x);
+        data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y);
+        data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z);
+        data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w);
 
         output_cast[offset] = data;
     }
@@ -354,13 +360,11 @@ __global__ void gptj_residual_add(__half* input,
         float2 res_vec = attn_cast[offset];
 
         float2 bias_vec = bias_cast[offset % intermediate_size];
-        float2 attn_bias_vec = attnbias_cast[offset % intermediate_size];
 
         __half2* vals_half = reinterpret_cast<__half2*>(&vals_vec);
         __half2* out_half = reinterpret_cast<__half2*>(&out_vec);
         __half2* res_half = reinterpret_cast<__half2*>(&res_vec);
         __half2* bias_half = reinterpret_cast<__half2*>(&bias_vec);
-        __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec);
 
         float2 low_data = __half22float2(vals_half[0]);
         float2 high_data = __half22float2(vals_half[1]);
@@ -373,18 +377,21 @@ __global__ void gptj_residual_add(__half* input,
 
         float2 low_bias = __half22float2(bias_half[0]);
         float2 high_bias = __half22float2(bias_half[1]);
-
-        float2 attn_low_bias = __half22float2(attnbias_half[0]);
-        float2 attn_high_bias = __half22float2(attnbias_half[1]);
-
-        low_data.x =
-            low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x + attn_low_bias.x));
-        low_data.y =
-            low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y + attn_low_bias.y));
-        high_data.x =
-            high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x + attn_high_bias.x));
-        high_data.y =
-            high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y + attn_high_bias.y));
+        if (attn_bias) {
+            float2 attn_bias_vec = attnbias_cast[offset % intermediate_size];
+            __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec);
+            float2 attn_low_bias = __half22float2(attnbias_half[0]);
+            float2 attn_high_bias = __half22float2(attnbias_half[1]);
+            low_data.x += attn_low_bias.x;
+            low_data.y += attn_low_bias.y;
+            high_data.x += attn_high_bias.x;
+            high_data.y += attn_high_bias.y;
+        }
+
+        low_data.x = low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x));
+        low_data.y = low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y));
+        high_data.x = high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x));
+        high_data.y = high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y));
 
         vals_half[0] = __float22half2_rn(low_data);
         vals_half[1] = __float22half2_rn(high_data);