Skip to content

Conversation

manish-jangra
Copy link
Contributor

This commit will do the following -

  • Create Power 10 Machines
  • Create 2TB Volume for each Power Machine

Copy link
Contributor

Code Review by Gemini

--- a/components/multi-platform-controller/production/stone-prd-rh01/host-config.yaml
+++ b/components/multi-platform-controller/production/stone-prd-rh01/host-config.yaml
@@ -665,43 +665,43 @@ data:
   host.s390x-static-15.secret: "ibm-s390x-static-ssh-key"
   host.s390x-static-15.concurrency: "4"
 
   host.s390x-static-16.address: "10.249.67.13"
   host.s390x-static-16.platform: "linux/s390x"
   host.s390x-static-16.user: "root"
   host.s390x-static-16.secret: "ibm-s390x-static-ssh-key"
   host.s390x-static-16.concurrency: "4"
 
   # PPC64LE 4cores(32vCPU) / 128GiB RAM / 2TB disk
   host.ppc64le-static-1.address: "10.244.0.57"
   host.ppc64le-static-1.platform: "linux/ppc64le"
   host.ppc64le-static-1.user: "root"
-  host.ppc64le-static-1.secret: "ibm-production-ppc64le-ssh-key"
+  host.ppc64le-static-1.secret: "ibm-ppc64le-ssh-key-wkjg"
   host.ppc64le-static-1.concurrency: "8"
 
   host.ppc64le-static-2.address: "10.244.0.4"
   host.ppc64le-static-2.platform: "linux/ppc64le"
   host.ppc64le-static-2.user: "root"
-  host.ppc64le-static-2.secret: "ibm-production-ppc64le-ssh-key"
+  host.ppc64le-static-2.secret: "ibm-ppc64le-ssh-key-wkjg"
   host.ppc64le-static-2.concurrency: "8"
 
   host.ppc64le-static-3.address: "10.244.0.14"
   host.ppc64le-static-3.platform: "linux/ppc64le"
   host.ppc64le-static-3.user: "root"
-  host.ppc64le-static-3.secret: "ibm-production-ppc64le-ssh-key"
+  host.ppc64le-static-3.secret: "ibm-ppc64le-ssh-key-wkjg"
   host.ppc64le-static-3.concurrency: "8"
 
   host.ppc64le-static-4.address: "10.244.0.6"
   host.ppc64le-static-4.platform: "linux/ppc64le"
   host.ppc64le-static-4.user: "root"
-  host.ppc64le-static-4.secret: "ibm-production-ppc64le-ssh-key"
+  host.ppc64le-static-4.secret: "ibm-ppc64le-ssh-key-wkjg"
   host.ppc64le-static-4.concurrency: "8"
 
   host.ppc64le-static-5.address: "10.244.0.48"
   host.ppc64le-static-5.platform: "linux/ppc64le"
   host.ppc64le-static-5.user: "root"
-  host.ppc64le-static-5.secret: "ibm-production-ppc64le-ssh-key"
+  host.ppc64le-static-5.secret: "ibm-ppc64le-ssh-key-wkjg"
   host.ppc64le-static-5.concurrency: "8"
 
   host.ppc64le-static-6.address: "10.244.0.46"
   host.ppc64le-static-6.platform: "linux/ppc64le"
   host.ppc64le-static-6.user: "root"
-  host.ppc64le-static-6.secret: "ibm-production-ppc64le-ssh-key"
+  host.ppc64le-static-6.secret: "ibm-ppc64le-ssh-key-wkjg"
   host.ppc64le-static-6.concurrency: "8"
 
   host.ppc64le-static-7.address: "10.244.0.33"
   host.ppc64le-static-7.platform: "linux/ppc64le"
   host.ppc64le-static-7.user: "root"
-  host.ppc64le-static-7.secret: "ibm-production-ppc64le-ssh-key"
+  host.ppc64le-static-7.secret: "ibm-ppc64le-ssh-key-wkjg"
   host.ppc64le-static-7.concurrency: "8"
 
   host.ppc64le-static-8.address: "10.244.0.54"
   host.ppc64le-static-8.platform: "linux/ppc64le"
   host.ppc64le-static-8.user: "root"
-  host.ppc64le-static-8.secret: "ibm-production-ppc64le-ssh-key"
+  host.ppc64le-static-8.secret: "ibm-ppc64le-ssh-key-wkjg"
   host.ppc64le-static-8.concurrency: "8"
 
 # GPU Instances
   dynamic.linux-g6xlarge-amd64.type: aws
   dynamic.linux-g6xlarge-amd64.region: us-east-1
   dynamic.linux-g6xlarge-amd64.ami: ami-0ad6c6b0ac6c36199
   dynamic.linux-g6xlarge-amd64.instance-type: g6.xlarge

Copy link
Contributor

Code Review by Gemini

The following issues and improvements are identified in the provided code changes:

components/multi-platform-controller/production/stone-prd-rh01/external-secrets.yaml

Issue: Orphaned ExternalSecret
The host-config.yaml file has been updated to use the new ibm-ppc64le-ssh-key-wkjg secret for all ppc64le-static hosts. However, the ibm-production-ppc64le-ssh-key ExternalSecret is still present in external-secrets.yaml but is no longer referenced by any host in host-config.yaml. This creates an orphaned resource that is not being used and should be removed for cleanup and to avoid confusion.

Suggested Change: Remove the unused ExternalSecret for ibm-production-ppc64le-ssh-key.

--- a/components/multi-platform-controller/production/stone-prd-rh01/external-secrets.yaml
+++ b/components/multi-platform-controller/production/stone-prd-rh01/external-secrets.yaml
@@ -84,6 +84,29 @@
   target:
     creationPolicy: Owner
     deletionPolicy: Delete
     name: ibm-production-s390x-static-ssh-key
 ---
 apiVersion: external-secrets.io/v1beta1
 kind: ExternalSecret
-metadata:
-  name: ibm-production-ppc64le-ssh-key
-  namespace: multi-platform-controller
-  labels:
-    build.appstudio.redhat.com/multi-platform-secret: "true"
-  annotations:
-    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
-    argocd.argoproj.io/sync-wave: "-1"
-spec:
-  dataFrom:
-    - extract:
-        key: production/platform/terraform/generated/stone-prd-rh01/ibm-production-ppc64le-ssh-key
-  refreshInterval: 1h
-  secretStoreRef:
-    kind: ClusterSecretStore
-    name: appsre-stonesoup-vault
-  target:
-    creationPolicy: Owner
-    deletionPolicy: Delete
-    name: ibm-production-ppc64le-ssh-key
 ---
 apiVersion: external-secrets.io/v1beta1
 kind: ExternalSecret
+metadata:
+  name: ibm-ppc64le-ssh-key-wkjg
+  namespace: multi-platform-controller
+  labels:
+    build.appstudio.redhat.com/multi-platform-secret: "true"
+  annotations:
+    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
+    argocd.argoproj.io/sync-wave: "-1"
+spec:
+  dataFrom:
+    - extract:
+        key: production/platform/terraform/generated/stone-prd-rh01/ibm-ppc64le-ssh-key-wkjg
+  refreshInterval: 1h
+  secretStoreRef:
+    kind: ClusterSecretStore
+    name: appsre-stonesoup-vault
+  target:
+    creationPolicy: Owner
+    deletionPolicy: Delete
+    name: ibm-ppc64le-ssh-key-wkjg
+---
+apiVersion: external-secrets.io/v1beta1
+kind: ExternalSecret
 metadata:
   name: ibm-production-s390x-ssh-key
   namespace: multi-platform-controller
   labels:
     build.appstudio.redhat.com/multi-platform-secret: "true"
   annotations:
     argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
     argocd.argoproj.io/sync-wave: "-1"
 spec:
   dataFrom:

@@ -115,6 +115,29 @@ spec:
---
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we never remove old keys? Sometimes we have 3-4 ssh keys secrets on the cluster and no idea which is current.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point, we should remove them when they are replaced.

Copy link

openshift-ci bot commented Sep 12, 2025

[APPROVALNOTIFIER] This PR is APPROVED

This pull-request has been approved by: manish-jangra, mshaposhnik

The full list of commands accepted by this bot can be found here.

The pull request process is described here

Needs approval from an approver in each of these files:

Approvers can indicate their approval by writing /approve in a comment
Approvers can cancel approval by writing /approve cancel in a comment

This commit will do the following -
- Create Power 10 Machines
- Create 2TB Volume for each Power Machine
Copy link
Contributor

Code Review by Gemini

## Code Review

### Bugs and Issues

*   **Potential Breaking Change in RBAC Policy:**
    The new `validate-restrict-binding-system-authenticated-clusterpolicy.yaml` policy restricts the `enterprisecontractpolicy-viewer-role` from being bound to the `system:authenticated` group within tenant namespaces. This was previously allowed for the `rhtap-releng-tenant` namespace by the `validate-restrict-binding-system-authenticated-releng-clusterpolicy.yaml` policy. This change could potentially break existing configurations or workflows that rely on `system:authenticated` having this role in tenant namespaces. Please confirm if this is the intended behavior.

    **File:** `components/policies/production/base/konflux-rbac/restrict-binding-system-authenticated/validate-restrict-binding-system-authenticated-clusterpolicy.yaml`
    **Line:**
    ```diff
    --- a/components/policies/production/base/konflux-rbac/restrict-binding-sysauth/validate-restrict-binding-sysauth-clusterpolicy.yaml
    +++ b/components/policies/production/base/konflux-rbac/restrict-binding-system-authenticated/validate-restrict-binding-system-authenticated-clusterpolicy.yaml
    @@ -34,7 +34,6 @@
         operator: Equals
         value: "ClusterRole"
       - key: "{{ request.object.roleRef.name }}"
    -    operator: NotEquals
    -    value: konflux-viewer-user-actions
    -    operator: AllNotIn
    -    value:
    -      - konflux-viewer-user-actions
    -      - enterprisecontractpolicy-viewer-role
    +    operator: NotEquals
    +    value: konflux-viewer-user-actions
     validate:
       allowExistingViolations: true
       failureAction: Enforce
    -  message: "Only ClusterRole 'konflux-viewer-user-actions' can be bound to 'system:authenticated' in tenant namespaces."
    +  message: "Only ClusterRole 'konflux-viewer-user-actions' can be bound to 'system:authenticated' in tenant namespaces."
    ```
    The change from `operator: AllNotIn` with two allowed roles to `operator: NotEquals` with only one allowed role means `enterprisecontractpolicy-viewer-role` is no longer an exception.

### Improvements

*   **Loki Ruler Configuration Clarity:**
    The Loki ruler component is now explicitly configured and enabled in production and staging environments, and disabled in development. This is a significant change from previous configurations where it was explicitly disabled with comments stating it was "not needed". Adding a comment to the `loki-helm-prod-values.yaml` and `loki-helm-stg-values.yaml` files explaining the purpose of enabling the ruler (e.g., for alert management, recording rules) would improve maintainability and understanding for future reviewers.

    **File:** `components/vector-kubearchive-log-collector/production/stone-prod-p02/loki-helm-prod-values.yaml`
    **Line:**
    ```diff
    --- a/components/vector-kubearchive-log-collector/production/stone-prod-p02/loki-helm-prod-values.yaml
    +++ b/components/vector-kubearchive-log-collector/production/stone-prod-p02/loki-helm-prod-values.yaml
    @@ -37,6 +37,9 @@
     query_range:
       # split_queries_by_interval deprecated in Loki 3.x - removed
       parallelise_shardable_queries: true
    +  # Configure ruler storage to use local filesystem
    +  ruler:
    +    rule_path: /var/loki/ruler
    +    storage:
    +      type: local
    +      local:
    +        directory: /var/loki/ruler
     
     # Distributed components configuration
     ingester:
    @@ -137,6 +140,12 @@
         memory: 1Gi
       affinity: {}
     
    +ruler:
    +  replicas: 1
    +  resources:
    +    requests:
    +      cpu: 100m
    +      memory: 256Mi
    +    limits:
    +      memory: 512Mi
    +
     # Enable Memcached caches for performance
     chunksCache:
       enabled: true
    ```
    Consider adding a comment like `# Enable Loki ruler for [specific purpose, e.g., alert management, recording rules]` above the `loki.ruler` block or the `ruler:` block. The same applies to `components/vector-kubearchive-log-collector/staging/stone-stg-rh01/loki-helm-stg-values.yaml`.

Copy link
Contributor

Code Review by Gemini

```diff
--- a/components/multi-platform-controller/production/stone-prd-rh01/external-secrets.yaml
+++ b/components/multi-platform-controller/production/stone-prd-rh01/external-secrets.yaml
@@ -86,39 +86,39 @@ spec:
     kind: ClusterSecretStore
     name: appsre-stonesoup-vault
   target:
     creationPolicy: Owner
     deletionPolicy: Delete
     name: aws-account
 ---
 apiVersion: external-secrets.io/v1beta1
 kind: ExternalSecret
 metadata:
-  name: ibm-production-ppc64le-ssh-key
+  name: ibm-ppc64le-ssh-key-wkjg
   namespace: multi-platform-controller
   labels:
     build.appstudio.redhat.com/multi-platform-secret: "true"
   annotations:
     argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
     argocd.argoproj.io/sync-wave: "-1"
 spec:
   dataFrom:
     - extract:
-        key: production/build/multi-platform-controller/ibm-production-ppc64le-ssh-key
+        key: production/platform/terraform/generated/stone-prd-rh01/ibm-ppc64le-ssh-key-wkjg
   refreshInterval: 1h
   secretStoreRef:
     kind: ClusterSecretStore
     name: appsre-stonesoup-vault
   target:
     creationPolicy: Owner
     deletionPolicy: Delete
-    name: ibm-production-ppc64le-ssh-key
+    name: ibm-ppc64le-ssh-key-wkjg
 ---

Issue: External Secret Source Verification

The dataFrom.extract.key path for the ExternalSecret has been changed significantly. This new path points to a different location in Vault. It is crucial to ensure that the secret at the new Vault path exists and contains the correct SSH key. If the secret is not correctly provisioned at production/platform/terraform/generated/stone-prd-rh01/ibm-ppc64le-ssh-key-wkjg, the ExternalSecret will fail to sync, and the Kubernetes Secret will not be created, leading to connection failures for the ppc64le-static hosts.

Suggested Improvement:
Verify that the secret ibm-ppc64le-ssh-key-wkjg is correctly provisioned in Vault at the path production/platform/terraform/generated/stone-prd-rh01/ibm-ppc64le-ssh-key-wkjg before deploying this change.

--- a/components/multi-platform-controller/production/stone-prd-rh01/external-secrets.yaml
+++ b/components/multi-platform-controller/production/stone-prd-rh01/external-secrets.yaml
@@ -99,7 +99,7 @@
 spec:
   dataFrom:
     - extract:
-        key: production/build/multi-platform-controller/ibm-production-ppc64le-ssh-key
+        key: production/platform/terraform/generated/stone-prd-rh01/ibm-ppc64le-ssh-key-wkjg
   refreshInterval: 1h
   secretStoreRef:
     kind: ClusterSecretStore

@mshaposhnik
Copy link
Contributor

/lgtm

@openshift-ci openshift-ci bot added the lgtm label Sep 15, 2025
@openshift-merge-bot openshift-merge-bot bot merged commit 13b7f90 into redhat-appstudio:main Sep 15, 2025
7 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants