- to: ${{ steps.codeowners.outputs.owners || secrets.SECURITY_EMAIL_RECIPIENTS }}
- from: GitHub Actions
-
+ subject: |
+ ${{
+ failure() && '🚨 OpenFL Security Scan Failed' ||
+ steps.report.outputs.has_vulnerabilities == 'true' && '⚠️ OpenFL Vulnerabilities Found' ||
+ '✅ OpenFL Security Scan Passed'
+ }}
+ body: file://report.md
+ to: ${{ steps.codeowners.outputs.owners }}
+ from: "OpenFL Security Bot "
+ content_type: text/html
+ convert_markdown: true
+
+ # ============ ARTIFACT UPLOADS ============
+ - name: Upload scan artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ name: security-reports-${{ github.run_id }}
+ path: |
+ trivy-fs-report.json
+ trivy-image-report.json
+ trivy-fs-sbom.json
+ trivy-image-sbom.json
+ report.md
+ retention-days: 30
+
+ # ============ FAILURE HANDLING ============
+ - name: Fail workflow if vulnerabilities found
+ if: steps.report.outputs.has_vulnerabilities == 'true' && github.event_name != 'schedule'
+ run: |
+ echo "::error::Critical/High vulnerabilities detected!"
+ exit 1
From 2d99f1e71c985a51b2aaf7ac2286f2b823c5c2d1 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 11:26:10 +0530
Subject: [PATCH 06/34] Update CODEOWNERS
---
CODEOWNERS | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
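Context: the Trivy security workflow e-mails the owners listed in this
file. Its helper .github/scripts/extract_emails.py is not shown in this
series, but later patches parse its output with jq as `.emails` and
`.codeowners_path`, so the contract looks roughly like the sketch below
(the regex and ordering are illustrative assumptions, not the real script):

    # Illustrative sketch of the extract_emails.py output contract.
    import json
    import re

    CODEOWNERS_PATH = "CODEOWNERS"
    with open(CODEOWNERS_PATH) as f:
        text = f.read()
    # Collect unique e-mail owners, preserving first-seen order.
    emails = list(dict.fromkeys(re.findall(r"[\w.+-]+@[\w.-]+\.\w+", text)))
    print(json.dumps({"emails": emails, "codeowners_path": CODEOWNERS_PATH}))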
diff --git a/CODEOWNERS b/CODEOWNERS
index cb0b89fc6b..b36cb38d58 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -30,6 +30,6 @@
/scripts/ aayush.garg@intel.com giribabu.bikki@intel.com karan.shah@intel.com patrick.foley@intel.com srikanth.enugula@intel.com teodor.parvanov@intel.com
# File level ownership
-CODEOWNERS aayush.garg@intel.com giribabu.bikki@intel.com patrick.foley@intel.com preethi.asokan@intel.com rahul.garg@intel.com srikanth.enugula@intel.com teodor.parvanov@intel.com
+CODEOWNERS akshay.pant@intel.com karan.shah@intel.com kevin.ta@intel.com noopur@intel.com patrick.foley@intel.com payal.chaurasiya@intel.com rahul.garg@intel.com rajith.krishnegowda@intel.com shailesh.pant@intel.com shailesh.tanwar@intel.com teodor.parvanov@intel.com
test-requirements.txt akshay.pant@intel.com karan.shah@intel.com kevin.ta@intel.com noopur@intel.com patrick.foley@intel.com payal.chaurasiya@intel.com rahul.garg@intel.com rajith.krishnegowda@intel.com shailesh.pant@intel.com shailesh.tanwar@intel.com teodor.parvanov@intel.com
From 522ad36cc150c709203b0fc0eb7aaeea880f434e Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 11:30:33 +0530
Subject: [PATCH 07/34] Update trivy.yml
---
.github/workflows/trivy.yml | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
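Note: GitHub Actions expressions have no ternary operator, so the
subject line leans on `cond && value || fallback` short-circuiting,
which the parentheses added here make explicit. Python's `and`/`or`
behave the same way, including the pitfall that a falsy middle value
falls through to the fallback (toy values below):

    # Same short-circuit selection as the Actions expression in this patch.
    failed = False
    has_vulnerabilities = True
    subject = (
        (failed and "OpenFL Security Scan Failed")
        or (has_vulnerabilities and "OpenFL Vulnerabilities Found")
        or "OpenFL Security Scan Passed"
    )
    assert subject == "OpenFL Vulnerabilities Found"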
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 8e7c6c60ac..357a0d292a 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -144,14 +144,13 @@ jobs:
server_port: ${{ secrets.SMTP_PORT }}
username: ${{ secrets.SMTP_USERNAME }}
password: ${{ secrets.SMTP_PASSWORD }}
- subject: |
- ${{
- failure() && '🚨 OpenFL Security Scan Failed' ||
- steps.report.outputs.has_vulnerabilities == 'true' && '⚠️ OpenFL Vulnerabilities Found' ||
- '✅ OpenFL Security Scan Passed'
- }}
+ subject: ${{
+ (failure() && '🚨 OpenFL Security Scan Failed') ||
+ (steps.report.outputs.has_vulnerabilities == 'true' && '⚠️ OpenFL Vulnerabilities Found') ||
+ '✅ OpenFL Security Scan Passed'
+ }}
body: file://report.md
- to: ${{ steps.codeowners.outputs.owners }}
+ to: ${{ steps.codeowners.outputs.emails }}
from: "OpenFL Security Bot "
content_type: text/html
convert_markdown: true
From eaa2bcc1bea16c18b25366df9fd2435d581e7315 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 11:33:31 +0530
Subject: [PATCH 08/34] Update trivy.yml
---
.github/workflows/trivy.yml | 28 ++++++++++++++++------------
1 file changed, 16 insertions(+), 12 deletions(-)
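Note: this patch replaces the deprecated `::set-output` workflow command
with appends to the file named by GITHUB_OUTPUT, and moves the subject
selection into an explicit shell step. The same output mechanism is
available from any language; a minimal Python sketch (the key name is
the one this workflow uses):

    import os

    # Append a key=value pair to this step's output file.
    with open(os.environ["GITHUB_OUTPUT"], "a") as out:
        out.write("has_vulnerabilities=true\n")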
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 357a0d292a..124f32e725 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -17,7 +17,7 @@ jobs:
security-events: write
actions: read
packages: read
- issues: write # Needed for creating issues if vulnerabilities found
+ issues: write
runs-on: ubuntu-22.04
timeout-minutes: 45
env:
@@ -33,7 +33,7 @@ jobs:
fetch-depth: 0
# ============ SCANNING PHASE ============
- - name: Run security scans
+ - name: Run filesystem scan
uses: aquasecurity/trivy-action@0.30.0
with:
scan-type: 'fs'
@@ -89,7 +89,7 @@ jobs:
echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md
echo "|----------|----|---------|---------|-------------|" >> report.md
jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md
- echo "::set-output name=has_vulnerabilities::true"
+ echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT
fi
fi
@@ -103,7 +103,7 @@ jobs:
echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md
echo "|----------|----|---------|---------|-------------|" >> report.md
jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md
- echo "::set-output name=has_vulnerabilities::true"
+ echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT
fi
fi
@@ -116,19 +116,27 @@ jobs:
cat report.md
# ============ NOTIFICATION PHASE ============
+ - name: Set notification subject
+ id: set-subject
+ run: |
+ if [[ "${{ job.status }}" == "failure" ]]; then
+ echo "subject=🚨 OpenFL Security Scan Failed" >> $GITHUB_OUTPUT
+ elif [[ "${{ steps.report.outputs.has_vulnerabilities }}" == "true" ]]; then
+ echo "subject=⚠️ OpenFL Vulnerabilities Found" >> $GITHUB_OUTPUT
+ else
+ echo "subject=✅ OpenFL Security Scan Passed" >> $GITHUB_OUTPUT
+ fi
+
- name: Extract CODEOWNERS emails
id: codeowners
run: |
- # Install Python if not already present
if ! command -v python &> /dev/null; then
sudo apt-get update && sudo apt-get install -y python3
fi
- # Run the Python script
OUTPUT=$(python .github/scripts/extract_emails.py)
echo "Extracted emails: $OUTPUT"
- # Parse JSON output and set outputs
EMAILS=$(echo "$OUTPUT" | jq -r '.emails | join(",")')
echo "emails=${EMAILS:-${{ secrets.SECURITY_EMAIL_RECIPIENTS }}}" >> $GITHUB_OUTPUT
echo "codeowners_path=$(echo "$OUTPUT" | jq -r '.codeowners_path')" >> $GITHUB_OUTPUT
@@ -144,11 +152,7 @@ jobs:
server_port: ${{ secrets.SMTP_PORT }}
username: ${{ secrets.SMTP_USERNAME }}
password: ${{ secrets.SMTP_PASSWORD }}
- subject: ${{
- (failure() && '🚨 OpenFL Security Scan Failed') ||
- (steps.report.outputs.has_vulnerabilities == 'true' && '⚠️ OpenFL Vulnerabilities Found') ||
- '✅ OpenFL Security Scan Passed'
- }}
+ subject: ${{ steps.set-subject.outputs.subject }}
body: file://report.md
to: ${{ steps.codeowners.outputs.emails }}
from: "OpenFL Security Bot "
From 9601e1d77ae51a0570e26ca69b4b0734c80619d3 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 11:43:55 +0530
Subject: [PATCH 09/34] Update trivy.yml
---
.github/workflows/trivy.yml | 125 ++++--------------------------------
1 file changed, 12 insertions(+), 113 deletions(-)
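Note: this patch converts report.md to HTML in-workflow with the
`markdown` package. One caveat worth flagging: markdown.markdown() does
not render pipe tables (which the report uses) unless the built-in
"tables" extension is enabled, e.g.:

    # Standalone equivalent of the conversion; file names as in the workflow.
    import markdown

    with open("report.md") as f:
        html = markdown.markdown(f.read(), extensions=["tables"])
    with open("report.html", "w") as f:
        f.write(html)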
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 124f32e725..4279b4beed 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -25,95 +25,7 @@ jobs:
COMMIT_ID: ${{ inputs.commit_id || github.sha }}
steps:
- # ============ SETUP PHASE ============
- - name: Checkout repository
- uses: actions/checkout@v4
- with:
- ref: ${{ env.COMMIT_ID }}
- fetch-depth: 0
-
- # ============ SCANNING PHASE ============
- - name: Run filesystem scan
- uses: aquasecurity/trivy-action@0.30.0
- with:
- scan-type: 'fs'
- format: 'json'
- output: 'trivy-fs-report.json'
- severity: 'CRITICAL,HIGH'
- ignore-unfixed: true
- vuln-type: 'os,library'
- security-checks: 'vuln'
-
- - name: Build Docker image
- run: |
- docker buildx build \
- --pull \
- --tag local/scan-target:${{ github.run_id }} \
- --file openfl-docker/Dockerfile.base \
- --load \
- .
-
- - name: Scan Docker image
- uses: aquasecurity/trivy-action@0.30.0
- with:
- image-ref: 'local/scan-target:${{ github.run_id }}'
- format: 'json'
- output: 'trivy-image-report.json'
- severity: 'CRITICAL,HIGH'
- ignore-unfixed: true
- vuln-type: 'os,library'
- security-checks: 'vuln'
-
- # ============ REPORTING PHASE ============
- - name: Generate SBOM reports
- run: |
- trivy fs --format spdx-json --output trivy-fs-sbom.json .
- trivy image --format spdx-json --output trivy-image-sbom.json local/scan-target:${{ github.run_id }}
-
- - name: Create consolidated report
- id: report
- run: |
- # Initialize markdown report
- echo "# Security Scan Report - OpenFL" > report.md
- echo "**Scan Date:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> report.md
- echo "**Commit:** [${{ env.COMMIT_ID }}](https://github.com/rajithkrishnegowda/openfl/commit/${{ env.COMMIT_ID }})" >> report.md
- echo -e "\n## Vulnerability Summary\n" >> report.md
-
- # Process filesystem results
- if [ -f "trivy-fs-report.json" ]; then
- FS_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-fs-report.json || echo 0)
- echo "### Filesystem Scans" >> report.md
- echo "**Critical/High Vulnerabilities:** $FS_VULNS" >> report.md
-
- if [ "$FS_VULNS" -gt 0 ]; then
- echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md
- echo "|----------|----|---------|---------|-------------|" >> report.md
- jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md
- echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT
- fi
- fi
-
- # Process image results
- if [ -f "trivy-image-report.json" ]; then
- IMG_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-image-report.json || echo 0)
- echo -e "\n### Container Image Scans" >> report.md
- echo "**Critical/High Vulnerabilities:** $IMG_VULNS" >> report.md
-
- if [ "$IMG_VULNS" -gt 0 ]; then
- echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md
- echo "|----------|----|---------|---------|-------------|" >> report.md
- jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md
- echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT
- fi
- fi
-
- # Add artifact download links
- echo -e "\n## Next Steps\n" >> report.md
- echo "1. Review the full reports in the workflow artifacts" >> report.md
- echo "2. Address critical vulnerabilities immediately" >> report.md
- echo "3. Create GitHub issues for tracking remediation" >> report.md
-
- cat report.md
+ # [Previous steps remain unchanged...]
# ============ NOTIFICATION PHASE ============
- name: Set notification subject
@@ -139,11 +51,17 @@ jobs:
EMAILS=$(echo "$OUTPUT" | jq -r '.emails | join(",")')
echo "emails=${EMAILS:-${{ secrets.SECURITY_EMAIL_RECIPIENTS }}}" >> $GITHUB_OUTPUT
- echo "codeowners_path=$(echo "$OUTPUT" | jq -r '.codeowners_path')" >> $GITHUB_OUTPUT
env:
PYTHONIOENCODING: utf-8
+ - name: Convert report to HTML
+ id: convert-report
+ run: |
+ pip install markdown
+ python -c "import markdown; print(markdown.markdown(open('report.md').read()))" > report.html
+ echo "html_body=$(cat report.html)" >> $GITHUB_OUTPUT
+
- name: Send email notification
if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure())
uses: dawidd6/action-send-mail@v3
@@ -153,28 +71,9 @@ jobs:
username: ${{ secrets.SMTP_USERNAME }}
password: ${{ secrets.SMTP_PASSWORD }}
subject: ${{ steps.set-subject.outputs.subject }}
- body: file://report.md
+ html_body: ${{ steps.convert-report.outputs.html_body }}
to: ${{ steps.codeowners.outputs.emails }}
- from: "OpenFL Security Bot "
- content_type: text/html
- convert_markdown: true
+ from: OpenFL Security Bot
+ convert_markdown: false
- # ============ ARTIFACT UPLOADS ============
- - name: Upload scan artifacts
- uses: actions/upload-artifact@v4
- with:
- name: security-reports-${{ github.run_id }}
- path: |
- trivy-fs-report.json
- trivy-image-report.json
- trivy-fs-sbom.json
- trivy-image-sbom.json
- report.md
- retention-days: 30
-
- # ============ FAILURE HANDLING ============
- - name: Fail workflow if vulnerabilities found
- if: steps.report.outputs.has_vulnerabilities == 'true' && github.event_name != 'schedule'
- run: |
- echo "::error::Critical/High vulnerabilities detected!"
- exit 1
+ # [Remaining steps remain unchanged...]
From 7ec5b90c616349c8951774a768f742c916358396 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 11:47:20 +0530
Subject: [PATCH 10/34] Update trivy.yml
---
.github/workflows/trivy.yml | 110 +++++++++++++++++++++++++++++++++++-
1 file changed, 108 insertions(+), 2 deletions(-)
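Note: this patch restores the scanning, reporting, artifact-upload, and
failure-handling steps elided by the previous commit. The report step
counts findings with jq over Trivy's JSON; a Python equivalent is handy
for checking those expressions locally (file name as in the workflow):

    import json

    with open("trivy-fs-report.json") as f:
        data = json.load(f)
    # Mirrors jq's .Results[]?.Vulnerabilities[]? traversal.
    vulns = [
        vuln
        for result in data.get("Results") or []
        for vuln in result.get("Vulnerabilities") or []
    ]
    print(len(vulns))
    for vuln in vulns:
        print(vuln["Severity"], vuln["VulnerabilityID"], vuln["PkgName"])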
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 4279b4beed..7b550fb18a 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -25,7 +25,95 @@ jobs:
COMMIT_ID: ${{ inputs.commit_id || github.sha }}
steps:
- # [Previous steps remain unchanged...]
+ # ============ SETUP PHASE ============
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ env.COMMIT_ID }}
+ fetch-depth: 0
+
+ # ============ SCANNING PHASE ============
+ - name: Run filesystem scan
+ uses: aquasecurity/trivy-action@0.30.0
+ with:
+ scan-type: 'fs'
+ format: 'json'
+ output: 'trivy-fs-report.json'
+ severity: 'CRITICAL,HIGH'
+ ignore-unfixed: true
+ vuln-type: 'os,library'
+ security-checks: 'vuln'
+
+ - name: Build Docker image
+ run: |
+ docker buildx build \
+ --pull \
+ --tag local/scan-target:${{ github.run_id }} \
+ --file openfl-docker/Dockerfile.base \
+ --load \
+ .
+
+ - name: Scan Docker image
+ uses: aquasecurity/trivy-action@0.30.0
+ with:
+ image-ref: 'local/scan-target:${{ github.run_id }}'
+ format: 'json'
+ output: 'trivy-image-report.json'
+ severity: 'CRITICAL,HIGH'
+ ignore-unfixed: true
+ vuln-type: 'os,library'
+ security-checks: 'vuln'
+
+ # ============ REPORTING PHASE ============
+ - name: Generate SBOM reports
+ run: |
+ trivy fs --format spdx-json --output trivy-fs-sbom.json .
+ trivy image --format spdx-json --output trivy-image-sbom.json local/scan-target:${{ github.run_id }}
+
+ - name: Create consolidated report
+ id: report
+ run: |
+ # Initialize markdown report
+ echo "# Security Scan Report - OpenFL" > report.md
+ echo "**Scan Date:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> report.md
+ echo "**Commit:** [${{ env.COMMIT_ID }}](https://github.com/rajithkrishnegowda/openfl/commit/${{ env.COMMIT_ID }})" >> report.md
+ echo -e "\n## Vulnerability Summary\n" >> report.md
+
+ # Process filesystem results
+ if [ -f "trivy-fs-report.json" ]; then
+ FS_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-fs-report.json || echo 0)
+ echo "### Filesystem Scans" >> report.md
+ echo "**Critical/High Vulnerabilities:** $FS_VULNS" >> report.md
+
+ if [ "$FS_VULNS" -gt 0 ]; then
+ echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md
+ echo "|----------|----|---------|---------|-------------|" >> report.md
+ jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md
+ echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT
+ fi
+ fi
+
+ # Process image results
+ if [ -f "trivy-image-report.json" ]; then
+ IMG_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-image-report.json || echo 0)
+ echo -e "\n### Container Image Scans" >> report.md
+ echo "**Critical/High Vulnerabilities:** $IMG_VULNS" >> report.md
+
+ if [ "$IMG_VULNS" -gt 0 ]; then
+ echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md
+ echo "|----------|----|---------|---------|-------------|" >> report.md
+ jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md
+ echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT
+ fi
+ fi
+
+ # Add artifact download links
+ echo -e "\n## Next Steps\n" >> report.md
+ echo "1. Review the full reports in the workflow artifacts" >> report.md
+ echo "2. Address critical vulnerabilities immediately" >> report.md
+ echo "3. Create GitHub issues for tracking remediation" >> report.md
+
+ cat report.md
# ============ NOTIFICATION PHASE ============
- name: Set notification subject
@@ -76,4 +164,22 @@ jobs:
from: OpenFL Security Bot
convert_markdown: false
- # [Remaining steps remain unchanged...]
+ # ============ ARTIFACT UPLOADS ============
+ - name: Upload scan artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ name: security-reports-${{ github.run_id }}
+ path: |
+ trivy-fs-report.json
+ trivy-image-report.json
+ trivy-fs-sbom.json
+ trivy-image-sbom.json
+ report.md
+ retention-days: 30
+
+ # ============ FAILURE HANDLING ============
+ - name: Fail workflow if vulnerabilities found
+ if: steps.report.outputs.has_vulnerabilities == 'true' && github.event_name != 'schedule'
+ run: |
+ echo "::error::Critical/High vulnerabilities detected!"
+ exit 1
From 447c9423a23e687491ae85af46445c4c86147570 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 11:55:51 +0530
Subject: [PATCH 11/34] Update trivy.yml
---
.github/workflows/trivy.yml | 26 ++++++++++++++++----------
1 file changed, 16 insertions(+), 10 deletions(-)
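Note: a multi-line HTML body cannot be emitted as a single `key=value`
output line, so this patch JSON-encodes it in Python and decodes it
with fromJSON() in the expression. The round-trip in isolation (toy
string below):

    import json

    html = "<h1>Security Scan Report</h1>\n<p>details...</p>"
    encoded = json.dumps(html)          # one line; quotes and newlines escaped
    assert json.loads(encoded) == html  # what fromJSON() recovers in the workflow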
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 7b550fb18a..0380bdf26b 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -143,23 +143,29 @@ jobs:
env:
PYTHONIOENCODING: utf-8
- - name: Convert report to HTML
- id: convert-report
+ - name: Prepare HTML email content
+ id: prepare-email
run: |
- pip install markdown
- python -c "import markdown; print(markdown.markdown(open('report.md').read()))" > report.html
- echo "html_body=$(cat report.html)" >> $GITHUB_OUTPUT
+ # Install markdown processor
+ python -m pip install markdown
+
+ # Convert markdown to HTML and properly escape for GITHUB_OUTPUT
+ HTML_CONTENT=$(python -c "import markdown, json; print(json.dumps(markdown.markdown(open('report.md').read())))")
+ echo "html_body=${HTML_CONTENT}" >> $GITHUB_OUTPUT
- name: Send email notification
if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure())
uses: dawidd6/action-send-mail@v3
with:
- server_address: ${{ secrets.SMTP_SERVER }}
- server_port: ${{ secrets.SMTP_PORT }}
- username: ${{ secrets.SMTP_USERNAME }}
- password: ${{ secrets.SMTP_PASSWORD }}
+ # Try connection_url format if server_address fails
+ connection_url: ${{ secrets.SMTP_CONNECTION_URL }} # Format: "smtps://user:pass@host:port"
+ # Alternative if connection_url doesn't work:
+ # server_address: ${{ secrets.SMTP_SERVER }}
+ # server_port: ${{ secrets.SMTP_PORT }}
+ # username: ${{ secrets.SMTP_USERNAME }}
+ # password: ${{ secrets.SMTP_PASSWORD }}
subject: ${{ steps.set-subject.outputs.subject }}
- html_body: ${{ steps.convert-report.outputs.html_body }}
+ html_body: ${{ fromJSON(steps.prepare-email.outputs.html_body) }}
to: ${{ steps.codeowners.outputs.emails }}
from: OpenFL Security Bot
convert_markdown: false
From 68f986d127eb8d26133ee925f58dcb0c82fa7f40 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 12:26:28 +0530
Subject: [PATCH 12/34] Update trivy.yml
---
.github/workflows/trivy.yml | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
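Note: the smtps:// connection URL bundles the same credentials, host,
and port that the four discrete secrets carry; this patch falls back to
the discrete form. In smtplib terms the two transports differ like this
(host, port, and credentials are placeholders):

    import smtplib

    # smtps:// style: implicit TLS, conventionally port 465.
    with smtplib.SMTP_SSL("smtp.example.com", 465) as server:
        server.login("user", "password")

    # Plain SMTP upgraded via STARTTLS, conventionally port 587.
    with smtplib.SMTP("smtp.example.com", 587) as server:
        server.starttls()
        server.login("user", "password")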
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 0380bdf26b..1d69915fb8 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -158,12 +158,12 @@ jobs:
uses: dawidd6/action-send-mail@v3
with:
# Try connection_url format if server_address fails
- connection_url: ${{ secrets.SMTP_CONNECTION_URL }} # Format: "smtps://user:pass@host:port"
+ #connection_url: ${{ secrets.SMTP_CONNECTION_URL }} # Format: "smtps://user:pass@host:port"
# Alternative if connection_url doesn't work:
- # server_address: ${{ secrets.SMTP_SERVER }}
- # server_port: ${{ secrets.SMTP_PORT }}
- # username: ${{ secrets.SMTP_USERNAME }}
- # password: ${{ secrets.SMTP_PASSWORD }}
+ server_address: ${{ secrets.SMTP_SERVER }}
+ server_port: ${{ secrets.SMTP_PORT }}
+ username: ${{ secrets.SMTP_USERNAME }}
+ password: ${{ secrets.SMTP_PASSWORD }}
subject: ${{ steps.set-subject.outputs.subject }}
html_body: ${{ fromJSON(steps.prepare-email.outputs.html_body) }}
to: ${{ steps.codeowners.outputs.emails }}
From 88e1769276831c129645540bed29477a52120241 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 12:42:33 +0530
Subject: [PATCH 13/34] Update trivy.yml
---
.github/workflows/trivy.yml | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 1d69915fb8..5fe05b50db 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -155,20 +155,17 @@ jobs:
- name: Send email notification
if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure())
- uses: dawidd6/action-send-mail@v3
+ uses: actions-hub/smtp@master
with:
- # Try connection_url format if server_address fails
- #connection_url: ${{ secrets.SMTP_CONNECTION_URL }} # Format: "smtps://user:pass@host:port"
- # Alternative if connection_url doesn't work:
- server_address: ${{ secrets.SMTP_SERVER }}
- server_port: ${{ secrets.SMTP_PORT }}
+ server: ${{ secrets.SMTP_SERVER }}
+ port: ${{ secrets.SMTP_PORT }}
username: ${{ secrets.SMTP_USERNAME }}
password: ${{ secrets.SMTP_PASSWORD }}
subject: ${{ steps.set-subject.outputs.subject }}
html_body: ${{ fromJSON(steps.prepare-email.outputs.html_body) }}
to: ${{ steps.codeowners.outputs.emails }}
from: OpenFL Security Bot
- convert_markdown: false
+ secure: true
# ============ ARTIFACT UPLOADS ============
- name: Upload scan artifacts
From 10e0bf4292ed522dd5453608d48cc64e79512e22 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 13:04:37 +0530
Subject: [PATCH 14/34] Create send_email.py
---
.github/scripts/send_email.py | 97 +++++++++++++++++++++++++++++++++++
1 file changed, 97 insertions(+)
create mode 100644 .github/scripts/send_email.py
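Note: the helper below can also be driven directly from Python; a sample
call mirroring the CLI flags (all values illustrative):

    from send_email import send_email

    send_email(
        sender_email="bot@example.com",
        to_email="a@example.com,b@example.com",
        subject="OpenFL Security Scan",
        email_body="<h1>Report</h1>",
        smtp_user="user",
        smtp_pwd="password",
        smtp_email_server="smtp.example.com:587",  # host:port, as the script expects
        is_html_body=True,
    )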
diff --git a/.github/scripts/send_email.py b/.github/scripts/send_email.py
new file mode 100644
index 0000000000..4c151e30fc
--- /dev/null
+++ b/.github/scripts/send_email.py
@@ -0,0 +1,97 @@
+import os
+import smtplib
+import logging
+import argparse
+from email.message import EmailMessage
+from email.mime.base import MIMEBase
+from email import encoders
+
+logger = logging.getLogger(__name__)
+
+def send_email(sender_email: str, to_email: str, subject: str, email_body: str, smtp_user: str, smtp_pwd: str,
+ smtp_email_server: str, cc_email: str = '', bcc_email: str = '', reply_email: str = '', is_html_body: bool = False,
+ attachments: str = '') -> None:
+
+ message = EmailMessage()
+ message["Subject"] = subject
+ message["From"] = sender_email
+ if to_email:
+ to_list = to_email.split(",")
+ message["To"] = ", ".join(to_list)
+ if cc_email:
+ cc_list = cc_email.split(",")
+ message["Cc"] = ", ".join(cc_list)
+ if reply_email:
+ message["Reply-To"] = reply_email
+ sub_type = 'plain'
+ if is_html_body:
+ sub_type = 'html'
+ message.set_content(email_body, subtype=sub_type)
+ # Set up attachment if any
+ if attachments:
+ for attachment in attachments.split(','):
+ with open(attachment, 'rb') as attachment_file:
+ attachment_data = attachment_file.read()
+ message.add_attachment(
+ attachment_data,
+ maintype='application',
+ subtype='octet-stream',
+ filename=os.path.basename(attachment)
+ )
+ logger.info(f'Setting smtp server {smtp_email_server}...')
+ smtp_server = smtplib.SMTP(smtp_email_server)
+ smtp_server.starttls()
+ smtp_server.login(smtp_user, smtp_pwd)
+ logger.info(f'smtp server authentication successful')
+ try:
+ logger.info(f'Sending email...')
+ if bcc_email:
+ # Send bcc list as an argument instead of adding it to the header to keep it hidden
+ bcc_list = bcc_email.split(",")
+ smtp_server.send_message(message, bcc=bcc_list)
+ else:
+ smtp_server.send_message(message)
+ logger.info(f'email sent.')
+ except Exception as ex:
+ raise ex
+ finally:
+ try:
+ smtp_server.quit()
+ except smtplib.SMTPServerDisconnected:
+ pass
+ finally:
+ logger.info("smtp connection is closed")
+
+def main():
+ parser = argparse.ArgumentParser(description="Send an email with optional attachments")
+ parser.add_argument('--sender', required=True, help='Sender email address')
+ parser.add_argument('--to', required=True, help='Recipient email address(es) (comma-separated)')
+ parser.add_argument('--subject', required=True, help='Email subject')
+ parser.add_argument('--body', required=True, help='Email body')
+ parser.add_argument('--smtp-user', required=True, help='SMTP server username')
+ parser.add_argument('--smtp-pwd', required=True, help='SMTP server password')
+ parser.add_argument('--smtp-server', required=True, help='SMTP server address and port')
+ parser.add_argument('--cc', default='', help='CC email address(es) (comma-separated)')
+ parser.add_argument('--bcc', default='', help='BCC email address(es) (comma-separated)')
+ parser.add_argument('--reply-to', default='', help='Reply-To email address')
+ parser.add_argument('--html-body', action='store_true', help='Flag to indicate if email body is HTML')
+ parser.add_argument('--attachments', default='', help='Attachment file path(s) (comma-separated)')
+ args = parser.parse_args()
+
+ send_email(
+ sender_email=args.sender,
+ to_email=args.to,
+ subject=args.subject,
+ email_body=args.body,
+ smtp_user=args.smtp_user,
+ smtp_pwd=args.smtp_pwd,
+ smtp_email_server=args.smtp_server,
+ cc_email=args.cc,
+ bcc_email=args.bcc,
+ reply_email=args.reply_to,
+ is_html_body=args.html_body,
+ attachments=args.attachments
+ )
+
+if __name__ == '__main__':
+ main()
From 0796a834dd7e926774f00e3f13fc1fd001777b7d Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Fri, 25 Apr 2025 13:16:06 +0530
Subject: [PATCH 15/34] Update trivy.yml
---
.github/workflows/trivy.yml | 46 ++++++++++++++++++++++---------------
1 file changed, 28 insertions(+), 18 deletions(-)
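Note: this patch adopts the documented heredoc form for multi-line step
outputs (html_body<<EOF ... EOF) and hands delivery to the Python
helper. The same write from Python, with a collision-safe delimiter
(sketch):

    import os
    import uuid

    html = "<h1>Security Scan Report</h1>\n<p>details...</p>"
    # A random delimiter cannot occur in the body by accident.
    delimiter = uuid.uuid4().hex
    with open(os.environ["GITHUB_OUTPUT"], "a") as out:
        out.write(f"html_body<<{delimiter}\n{html}\n{delimiter}\n")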
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 5fe05b50db..7c9b79761a 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -32,6 +32,11 @@ jobs:
ref: ${{ env.COMMIT_ID }}
fetch-depth: 0
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+
# ============ SCANNING PHASE ============
- name: Run filesystem scan
uses: aquasecurity/trivy-action@0.30.0
@@ -143,29 +148,34 @@ jobs:
env:
PYTHONIOENCODING: utf-8
- - name: Prepare HTML email content
+ - name: Prepare email content
id: prepare-email
run: |
- # Install markdown processor
+ # Convert markdown to HTML
python -m pip install markdown
+ HTML_CONTENT=$(python -c "import markdown; print(markdown.markdown(open('report.md').read()))")
+ echo "html_body<> $GITHUB_OUTPUT
+ echo "$HTML_CONTENT" >> $GITHUB_OUTPUT
+ echo "EOF" >> $GITHUB_OUTPUT
- # Convert markdown to HTML and properly escape for GITHUB_OUTPUT
- HTML_CONTENT=$(python -c "import markdown, json; print(json.dumps(markdown.markdown(open('report.md').read())))")
- echo "html_body=${HTML_CONTENT}" >> $GITHUB_OUTPUT
-
- - name: Send email notification
+ - name: Send email via Python script
if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure())
- uses: actions-hub/smtp@master
- with:
- server: ${{ secrets.SMTP_SERVER }}
- port: ${{ secrets.SMTP_PORT }}
- username: ${{ secrets.SMTP_USERNAME }}
- password: ${{ secrets.SMTP_PASSWORD }}
- subject: ${{ steps.set-subject.outputs.subject }}
- html_body: ${{ fromJSON(steps.prepare-email.outputs.html_body) }}
- to: ${{ steps.codeowners.outputs.emails }}
- from: OpenFL Security Bot
- secure: true
+ env:
+ SMTP_SERVER: ${{ secrets.SMTP_SERVER }}
+ SMTP_PORT: ${{ secrets.SMTP_PORT }}
+ SMTP_USER: ${{ secrets.SMTP_USER }}
+ SMTP_PASSWORD: ${{ secrets.SMTP_PASSWORD }}
+ RECIPIENTS: ${{ steps.codeowners.outputs.emails }}
+ run: |
+ python .github/scripts/send_email.py \
+ --sender "security@openfl.github" \
+ --to "$RECIPIENTS" \
+ --subject "${{ steps.set-subject.outputs.subject }}" \
+ --body "${{ steps.prepare-email.outputs.html_body }}" \
+ --smtp-user "$SMTP_USER" \
+ --smtp-pwd "$SMTP_PASSWORD" \
+ --smtp-server "$SMTP_SERVER:$SMTP_PORT" \
+ --html-body
# ============ ARTIFACT UPLOADS ============
- name: Upload scan artifacts
From 7c3f5eb4df42a1ae16e9bd727dc0ee4c1d16df5b Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Thu, 8 May 2025 21:05:03 +0530
Subject: [PATCH 16/34] Create phi-4-quanti.ipynb
---
.../workflow/LLM/phi-4-quanti.ipynb | 705 ++++++++++++++++++
1 file changed, 705 insertions(+)
create mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
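Note: the notebook's join step averages PEFT adapter weights across
collaborators with FedAvg. Stripped to its arithmetic, on toy tensors
with equal weights:

    import numpy as np

    # Two collaborators' copies of one adapter parameter.
    theta_portland = np.array([1.0, 2.0, 3.0])
    theta_seattle = np.array([3.0, 4.0, 5.0])
    # Unweighted FedAvg is the element-wise mean.
    theta_avg = np.average([theta_portland, theta_seattle], axis=0)
    assert np.allclose(theta_avg, [2.0, 3.0, 4.0])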
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
new file mode 100644
index 0000000000..0c6884f384
--- /dev/null
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -0,0 +1,705 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "a59f475d-d843-46bc-b75e-10984b687ed3",
+ "metadata": {},
+ "source": [
+ "# Federated Fine-Tuning of Phi-4 Using OpenFL"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
+ "metadata": {},
+ "source": [
+ "\n",
+ "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow.\n",
+ "\n",
+ "We will fine-tune **Microsoft's [Phi4](https://huggingface.co/microsoft/phi-4)** model using a diverse dataset such as [Math_10k](https://github.com/AGI-Edgerunners/LLM-Adapters/tree/main), an open-source dataset containing mathematical question-answer pairs collected from various smaller math datasets."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
+ "metadata": {},
+ "source": [
+ "## The Workflow Interface"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e3d74610-e48d-4dd4-b622-eb910fbe91aa",
+ "metadata": {},
+ "source": [
+ "The workflow interface is an innovative approach to designing federated learning experiments with OpenFL. It was developed in response to discussions with researchers and users who had unique use cases that didn’t perfectly align with the traditional horizontal federated learning model. This interface enables more flexible compositions of experiments, allowing for greater customization and adaptability in complex, real-world scenarios"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "413e1d95-fd76-4fe0-b8d0-4c625c2a8fd3",
+ "metadata": {},
+ "source": [
+ "## Installing OpenFL\n",
+ "To install OpenFL, follow the official documentation: \n",
+ "[OpenFL Installation Guide](https://openfl.readthedocs.io/en/latest/installation.html)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53654c70",
+ "metadata": {},
+ "source": [
+ "After installation, activate experimental APIs using: \n",
+ "`fx experimental activate`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Install dependencies \n",
+ "!pip install torch transformers peft datasets trl==0.12.2 -q"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
+ "metadata": {},
+ "source": [
+ "## Import libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "be4690ae-0671-4d3a-8f21-620ab865a03e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hashlib\n",
+ "import os\n",
+ "\n",
+ "import numpy as np\n",
+ "import requests\n",
+ "import torch\n",
+ "import transformers\n",
+ "from datasets import load_dataset\n",
+ "from peft import LoraConfig, get_peft_model\n",
+ "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments\n",
+ "from transformers.trainer_callback import PrinterCallback\n",
+ "from trl import SFTTrainer\n",
+ "\n",
+ "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n",
+ "from openfl.experimental.workflow.placement import aggregator, collaborator\n",
+ "from openfl.experimental.workflow.runtime import LocalRuntime"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
+ "metadata": {},
+ "source": [
+ "## Acquiring and preprocessing dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7ba1d8b6-8a5b-41a2-8c77-c9a85e869cda",
+ "metadata": {},
+ "source": [
+ "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d615d626-8727-4169-b2a6-3ba15c3cdb95",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def file_checksum(file_path, algorithm=\"sha256\"):\n",
+ " \"\"\"\n",
+ " Calculate the checksum of a file using the specified hashing algorithm.\n",
+ "\n",
+ " Parameters:\n",
+ " file_path (str): The path to the file for which the checksum is to be calculated.\n",
+ " algorithm (str): The hashing algorithm to use (default is 'sha256').\n",
+ "\n",
+ " Returns:\n",
+ " str: The calculated checksum of the file.\n",
+ " \"\"\"\n",
+ " hash_func = hashlib.new(algorithm)\n",
+ " with open(file_path, \"rb\") as f:\n",
+ " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
+ " hash_func.update(chunk)\n",
+ " return hash_func.hexdigest()\n",
+ "\n",
+ "\n",
+ "if not os.path.exists(\"math_10k.json\"):\n",
+ " r = requests.get(\n",
+ " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n",
+ " )\n",
+ " with open(\n",
+ " \"math_10k.json\",\n",
+ " \"wb\",\n",
+ " ) as f:\n",
+ " f.write(r.content)\n",
+ "\n",
+ " actual_checksum = file_checksum(\"math_10k.json\")\n",
+ " if (\n",
+ " actual_checksum\n",
+ " != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n",
+ " ):\n",
+ " raise ValueError(\n",
+ " \"Checksum verification failed. The file may have been altered.\"\n",
+ " )\n",
+ "\n",
+ "raw_dataset = load_dataset(\"json\", data_files=\"math_10k.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3ab15ad6-db35-4a58-a2d5-54a6d3ccdc78",
+ "metadata": {},
+ "source": [
+ "## Initialize arguments and configurations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eada9809-468a-47c6-9b03-55aa887c9487",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training_config = {\n",
+ " \"bf16\": True,\n",
+ " \"use_cpu\": True,\n",
+ " \"do_eval\": False,\n",
+ " \"learning_rate\": 5.0e-06,\n",
+ " \"log_level\": \"info\",\n",
+ " \"logging_steps\": 20,\n",
+ " \"lr_scheduler_type\": \"cosine\",\n",
+ " \"num_train_epochs\": 1,\n",
+ " \"output_dir\": \"./checkpoint_dir\",\n",
+ " \"overwrite_output_dir\": True,\n",
+ " \"per_device_eval_batch_size\": 1,\n",
+ " \"per_device_train_batch_size\": 1,\n",
+ " \"save_steps\": 100,\n",
+ " \"save_total_limit\": 1,\n",
+ " \"seed\": 0,\n",
+ " \"gradient_checkpointing\": True,\n",
+ " \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n",
+ " \"warmup_ratio\": 0.2,\n",
+ "}\n",
+ "\n",
+ "peft_config = {\n",
+ " \"r\": 1,\n",
+ " \"lora_alpha\": 2,\n",
+ " \"lora_dropout\": 0.05,\n",
+ " \"bias\": \"none\",\n",
+ " \"task_type\": \"CAUSAL_LM\",\n",
+ " \"target_modules\": \"all-linear\",\n",
+ " \"modules_to_save\": None,\n",
+ "}\n",
+ "model_kwargs = dict(\n",
+ " use_cache=False,\n",
+ " trust_remote_code=True,\n",
+ " torch_dtype=torch.bfloat16,\n",
+ " device_map=None,\n",
+ ")\n",
+ "train_conf = TrainingArguments(**training_config)\n",
+ "peft_conf = LoraConfig(**peft_config)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
+ "metadata": {},
+ "source": [
+ "## Load and initialize model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "checkpoint_path = \"NyxKrage/Microsoft_Phi-4\"\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " checkpoint_path, return_dict=True, **model_kwargs\n",
+ ")\n",
+ "model = get_peft_model(model, peft_conf)\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)\n",
+ "sequence_max_length = 512\n",
+ "val_set_size = 2000\n",
+ "tokenizer.pad_token_id = 0 # we want this to be different from the eos token\n",
+ "tokenizer.padding_side = \"left\" # Allow batched inference"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
+ "metadata": {},
+ "source": [
+ "## Preprocess dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_prompt(data_point):\n",
+ " \"\"\"\n",
+ " Generate a prompt based on the given data point.\n",
+ "\n",
+ " Parameters:\n",
+ " data_point (dict): A dictionary containing the instruction, input, and output.\n",
+ "\n",
+ " Returns:\n",
+ " str: The generated prompt as a string.\n",
+ " \"\"\"\n",
+ " if data_point[\"input\"]:\n",
+ " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. \n",
+ "\n",
+ " ### Instruction:\n",
+ " {data_point[\"instruction\"]}\n",
+ " \n",
+ " ### Input:\n",
+ " {data_point[\"input\"]}\n",
+ " \n",
+ " ### Response:\n",
+ " {data_point[\"output\"]}\"\"\"\n",
+ " else:\n",
+ " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request. \n",
+ "\n",
+ " ### Instruction:\n",
+ " {data_point[\"instruction\"]}\n",
+ " \n",
+ " ### Response:\n",
+ " {data_point[\"output\"]}\"\"\"\n",
+ "\n",
+ "\n",
+ "def tokenize(prompt, add_eos_token=True):\n",
+ " \"\"\"\n",
+ " Tokenize the given prompt.\n",
+ "\n",
+ " Parameters:\n",
+ " prompt (str): The prompt to be tokenized.\n",
+ " add_eos_token (bool): Whether to add an end-of-sequence token (default is True).\n",
+ "\n",
+ " Returns:\n",
+ " dict: A dictionary containing the tokenized input IDs and attention mask.\n",
+ " \"\"\"\n",
+ " result = tokenizer(\n",
+ " prompt,\n",
+ " truncation=True,\n",
+ " max_length=sequence_max_length,\n",
+ " padding=False,\n",
+ " return_tensors=None,\n",
+ " )\n",
+ " if (\n",
+ " result[\"input_ids\"][-1] != tokenizer.eos_token_id\n",
+ " and len(result[\"input_ids\"]) < sequence_max_length\n",
+ " and add_eos_token\n",
+ " ):\n",
+ " result[\"input_ids\"].append(tokenizer.eos_token_id)\n",
+ " result[\"attention_mask\"].append(1)\n",
+ "\n",
+ " result[\"labels\"] = result[\"input_ids\"].copy()\n",
+ "\n",
+ " return result\n",
+ "\n",
+ "\n",
+ "def generate_and_tokenize_prompt(data_point):\n",
+ " \"\"\"\n",
+ " Generate and tokenize a prompt based on the given data point.\n",
+ "\n",
+ " Parameters:\n",
+ " data_point (dict): A dictionary containing the instruction, input, and output.\n",
+ "\n",
+ " Returns:\n",
+ " dict: A dictionary containing the tokenized input IDs, attention mask, and labels.\n",
+ " \"\"\"\n",
+ " full_prompt = generate_prompt(data_point)\n",
+ " tokenized_full_prompt = tokenize(full_prompt)\n",
+ " user_prompt = generate_prompt({**data_point, \"output\": \"\"})\n",
+ " tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)\n",
+ " user_prompt_len = len(tokenized_user_prompt[\"input_ids\"])\n",
+ "\n",
+ " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\n",
+ " \"labels\"\n",
+ " ][user_prompt_len:]\n",
+ " return tokenized_full_prompt\n",
+ "\n",
+ "\n",
+ "train_val = raw_dataset[\"train\"].train_test_split(\n",
+ " test_size=val_set_size, shuffle=True, seed=42\n",
+ ")\n",
+ "\n",
+ "processed_train_dataset = train_val[\"train\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))\n",
+ "processed_test_dataset = train_val[\"test\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
+ "metadata": {},
+ "source": [
+ "## Define Federated Averaging Method\n",
+ "The FedAvg method is used to average the models from all the collaborators after training."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def FedAvg(peft_params, model, weights=None):\n",
+ " \"\"\"\n",
+ " Perform Federated Averaging (FedAvg) on the model parameters.\n",
+ "\n",
+ " Parameters:\n",
+ " peft_params (list): A list of state dictionaries containing the model parameters from different clients.\n",
+ " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n",
+ " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n",
+ "\n",
+ " Returns:\n",
+ " torch.nn.Module: The model with the averaged parameters applied.\n",
+ " \"\"\"\n",
+ " state_dicts = peft_params\n",
+ " state_dict = get_peft_model_state_dict(model)\n",
+ " for key in peft_params[0]:\n",
+ " dtype = state_dicts[0][key].dtype\n",
+ " state_dict[key] = torch.from_numpy(\n",
+ " np.average(\n",
+ " [state[key].to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n",
+ " )\n",
+ " ).to(dtype)\n",
+ " set_peft_model_state_dict(model, state_dict)\n",
+ " return model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "810eb75e",
+ "metadata": {},
+ "source": [
+ "Now we come to the flow definition. The OpenFL Workflow Interface adopts the conventions set by Metaflow, that every workflow begins with `start` and concludes with the `end` task. The aggregator begins with an optionally passed in model and optimizer. The aggregator begins the flow with the `start` task, where the list of collaborators is extracted from the runtime (`self.collaborators = self.runtime.collaborators`) and is then used as the list of participants to run the task listed in `self.next`, `aggregated_model_validation`. The model, optimizer, and anything that is not explicitly excluded from the next function will be passed from the `start` function on the aggregator to the `aggregated_model_validation` task on the collaborator. Where the tasks run is determined by the placement decorator that precedes each task definition (`@aggregator` or `@collaborator`). Once each of the collaborators (defined in the runtime) complete the `aggregated_model_validation` task, they pass their current state onto the `train` task, from `train` to `local_model_validation`, and then finally to `join` at the aggregator. It is in `join` that an average is taken of the model weights, and the next round can begin.\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58298e8e-ab9e-4377-966e-143823441697",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class FederatedFlow(FLSpec):\n",
+ " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n",
+ " \"\"\"\n",
+ " Initialize the class with the given model, optimizer, and number of rounds.\n",
+ "\n",
+ " Parameters:\n",
+ " model (torch.nn.Module, optional): The model to be used. If None, a ValueError is raised.\n",
+ " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n",
+ " rounds (int, optional): The number of rounds for training or processing (default is 3).\n",
+ " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n",
+ "\n",
+ " Raises:\n",
+ " ValueError: If no model is provided.\n",
+ " \"\"\"\n",
+ " super().__init__(**kwargs)\n",
+ " if model is not None:\n",
+ " self.model = model\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " self.optimizer = optimizer\n",
+ " else:\n",
+ " raise ValueError(\"No model inputted\")\n",
+ "\n",
+ " self.rounds = rounds\n",
+ " \n",
+ "\n",
+ " @aggregator\n",
+ " def start(self):\n",
+ " \"\"\"\n",
+ " Initialize the model and set up the collaborators for federated learning.\n",
+ "\n",
+ " This method performs the initial setup for the model, including setting the\n",
+ " collaborators, initializing private variables, and starting the first round\n",
+ " of the federated learning process.\n",
+ " \"\"\"\n",
+ " print(f\"Performing initialization for model\")\n",
+ " self.collaborators = self.runtime.collaborators\n",
+ " self.current_round = 0\n",
+ " self.next(\n",
+ " self.aggregated_model_validation,\n",
+ " foreach=\"collaborators\",\n",
+ " )\n",
+ "\n",
+ " \n",
+ " @collaborator\n",
+ " def aggregated_model_validation(self):\n",
+ " \"\"\"\n",
+ " Perform aggregated model validation for a collaborator.\n",
+ "\n",
+ " This method loads the model, applies the PEFT configuration, and evaluates\n",
+ " the model using the provided training and evaluation datasets. The validation\n",
+ " score is then stored and the next step in the process is triggered.\n",
+ " \"\"\"\n",
+ " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n",
+ " self.model = AutoModelForCausalLM.from_pretrained(\n",
+ " checkpoint_path, return_dict=True, **model_kwargs\n",
+ " )\n",
+ " self.model = get_peft_model(self.model, peft_conf)\n",
+ " set_peft_model_state_dict(self.model, self.peft_params)\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=train_conf,\n",
+ " peft_config=peft_conf,\n",
+ " train_dataset=self.train_dataset,\n",
+ " eval_dataset=self.eval_dataset,\n",
+ " max_seq_length=sequence_max_length,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ "\n",
+ " trainer.remove_callback(PrinterCallback)\n",
+ " out = trainer.evaluate()\n",
+ " self.agg_validation_score = out[\"eval_loss\"]\n",
+ " print(f\"{self.input} value of {self.agg_validation_score}\")\n",
+ " self.next(self.train)\n",
+ "\n",
+ " @collaborator\n",
+ " def train(self):\n",
+ " \"\"\"\n",
+ " Train the model for a collaborator.\n",
+ "\n",
+ " This method trains the model using the provided training and evaluation datasets.\n",
+ " The training loss is stored, the model is saved, and the next step in the process\n",
+ " is triggered.\n",
+ " \"\"\"\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=train_conf,\n",
+ " peft_config=peft_conf,\n",
+ " train_dataset=self.train_dataset,\n",
+ " eval_dataset=self.eval_dataset,\n",
+ " max_seq_length=sequence_max_length,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ "\n",
+ " out = trainer.train()\n",
+ " self.loss = out.training_loss\n",
+ " trainer.save_model()\n",
+ " self.training_completed = True\n",
+ " self.next(self.local_model_validation)\n",
+ "\n",
+ " @collaborator\n",
+ " def local_model_validation(self):\n",
+ " \"\"\"\n",
+ " Perform local model validation for a collaborator.\n",
+ "\n",
+ " This method evaluates the model using the provided training and evaluation datasets.\n",
+ " The validation score is stored, the PEFT parameters are updated, and the next step\n",
+ " in the process is triggered.\n",
+ " \"\"\"\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=train_conf,\n",
+ " peft_config=peft_conf,\n",
+ " train_dataset=processed_train_dataset,\n",
+ " eval_dataset=processed_test_dataset,\n",
+ " max_seq_length=sequence_max_length,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ " out = trainer.evaluate()\n",
+ " self.local_validation_score = out[\"eval_loss\"]\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " print(f\"Doing local model validation for collaborator {self.input}\")\n",
+ " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n",
+ "\n",
+ " @aggregator\n",
+ " def join(self, inputs):\n",
+ " \"\"\"\n",
+ " Aggregate the results from all collaborators and update the model.\n",
+ "\n",
+ " This method calculates the average loss, aggregated model accuracy, and local model\n",
+ " accuracy from all collaborators. The model parameters are updated using Federated\n",
+ " Averaging (FedAvg), and the next round of the process is triggered if applicable.\n",
+ " \"\"\"\n",
+ " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n",
+ " self.aggregated_model_accuracy = sum(\n",
+ " input.agg_validation_score for input in inputs\n",
+ " ) / len(inputs)\n",
+ " self.local_model_accuracy = sum(\n",
+ " input.local_validation_score for input in inputs\n",
+ " ) / len(inputs)\n",
+ " print(\n",
+ " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n",
+ " )\n",
+ " print(f\"Average training loss = {self.average_loss}\")\n",
+ " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n",
+ "\n",
+ " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ "\n",
+ " self.model.save_pretrained(\"./aggregated/model\")\n",
+ " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n",
+ " self.current_round += 1\n",
+ " if self.current_round < self.rounds:\n",
+ " self.next(\n",
+ " self.aggregated_model_validation,\n",
+ " foreach=\"collaborators\",\n",
+ " exclude=[\"model\"],\n",
+ " )\n",
+ " else:\n",
+ " self.next(self.end)\n",
+ "\n",
+ " @aggregator\n",
+ " def end(self):\n",
+ " \"\"\"\n",
+ " End the federated learning process.\n",
+ "\n",
+ " This method marks the end of the federated learning process and performs any\n",
+ " necessary cleanup or finalization steps.\n",
+ " \"\"\"\n",
+ " print(f\"This is the end of the flow\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba",
+ "metadata": {},
+ "source": [
+ "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_dataset` and `eval_dataset` for each of the collaborators. These are **private_attributes** that are exposed only through the runtime. Each participant has its own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task.\n",
+ "\n",
+ "Below, we segment shards of the Math_10k dataset for **two collaborators**: Portland and Seattle. Each has their own slice of the dataset that's accessible via the `train_dataset` or `eval_dataset` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transferring from collaborator to aggregator, or vice versa."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Setup participants\n",
+ "_aggregator = Aggregator()\n",
+ "_aggregator.private_attributes = {}\n",
+ "\n",
+ "# Setup collaborators with private attributes\n",
+ "collaborator_names = [\n",
+ " \"Portland\",\n",
+ " \"Seattle\",\n",
+ "]\n",
+ "_collaborators = [Collaborator(name=name) for name in collaborator_names]\n",
+ "\n",
+ "for idx, current_collaborator in enumerate(_collaborators):\n",
+ " # Set the private attributes of the Collaborator to include their specific training and testing data loaders\n",
+ " current_collaborator.private_attributes = {\n",
+ " \"train_dataset\": processed_train_dataset.shard(\n",
+ " num_shards=len(_collaborators), index=idx\n",
+ " ),\n",
+ " \"eval_dataset\": processed_test_dataset.shard(\n",
+ " num_shards=len(_collaborators), index=idx\n",
+ " ),\n",
+ " }\n",
+ "\n",
+ "local_runtime = LocalRuntime(\n",
+ " aggregator=_aggregator, collaborators=_collaborators, backend=\"single_process\"\n",
+ ")\n",
+ "print(f\"Local runtime collaborators = {local_runtime.collaborators}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9cb61fc0",
+ "metadata": {},
+ "source": [
+ "## Run Experiment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "flflow = FederatedFlow(model, rounds=2)\n",
+ "flflow.runtime = local_runtime\n",
+ "flflow.run()\n",
+ "\n",
+ "# Determine the final model accuracy:\n",
+ "print(f'\\nFinal aggregated model accuracy for {flflow.rounds} rounds of training: {flflow.aggregated_model_accuracy}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7bc8fe27",
+ "metadata": {},
+ "source": [
+ "## 🎉 Congratulations! 🎉\n",
+ "\n",
+ "Now that you've completed this notebook, check out our [other tutorials](https://github.com/securefederatedai/openfl/tree/develop/openfl-tutorials/experimental/)\n",
+ "\n",
+ "- Using the LocalRuntime Ray Backend for dedicated GPU access\n",
+ "- Vertical Federated Learning\n",
+ "- Model Watermarking\n",
+ "- Differential Privacy\n",
+ "- And More!"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
From 35667d04fee574c828ee182b022c1a60c5f4fb5e Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Mon, 12 May 2025 15:11:01 +0530
Subject: [PATCH 17/34] Update phi-4-quanti.ipynb
---
.../workflow/LLM/phi-4-quanti.ipynb | 425 +++---------------
1 file changed, 59 insertions(+), 366 deletions(-)
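Note: the retitled notebook promises 8-bit quantization; the hunks
visible here only add the BitsAndBytesConfig import. For reference, a
typical 8-bit load with transformers looks like the sketch below (a
sketch, not the notebook's exact cell):

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        "NyxKrage/Microsoft_Phi-4",  # checkpoint used earlier in the notebook
        quantization_config=bnb_config,
        trust_remote_code=True,
    )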
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
index 0c6884f384..33281dabb0 100644
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -5,7 +5,7 @@
"id": "a59f475d-d843-46bc-b75e-10984b687ed3",
"metadata": {},
"source": [
- "# Federated Fine-Tuning of Phi-4 Using OpenFL"
+ "# Federated Fine-Tuning of Phi-4 Using OpenFL with 8-bit Quantization"
]
},
{
@@ -13,10 +13,7 @@
"id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
"metadata": {},
"source": [
- "\n",
- "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow.\n",
- "\n",
- "We will fine-tune **Microsoft's [Phi4](https://huggingface.co/microsoft/phi-4)** model using a diverse dataset such as [Math_10k](https://github.com/AGI-Edgerunners/LLM-Adapters/tree/main), an open-source dataset containing mathematical question-answer pairs collected from various smaller math datasets."
+ "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with 8-bit quantization."
]
},
{
@@ -24,34 +21,7 @@
"id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
"metadata": {},
"source": [
- "## The Workflow Interface"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e3d74610-e48d-4dd4-b622-eb910fbe91aa",
- "metadata": {},
- "source": [
- "The workflow interface is an innovative approach to designing federated learning experiments with OpenFL. It was developed in response to discussions with researchers and users who had unique use cases that didn’t perfectly align with the traditional horizontal federated learning model. This interface enables more flexible compositions of experiments, allowing for greater customization and adaptability in complex, real-world scenarios"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "413e1d95-fd76-4fe0-b8d0-4c625c2a8fd3",
- "metadata": {},
- "source": [
- "## Installing OpenFL\n",
- "To install OpenFL, follow the official documentation: \n",
- "[OpenFL Installation Guide](https://openfl.readthedocs.io/en/latest/installation.html)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "53654c70",
- "metadata": {},
- "source": [
- "After installation, activate experimental APIs using: \n",
- "`fx experimental activate`"
+ "## Installation"
]
},
{
@@ -61,8 +31,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Install dependencies \n",
- "!pip install torch transformers peft datasets trl==0.12.2 -q"
+ "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q"
]
},
{
@@ -70,7 +39,7 @@
"id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
"metadata": {},
"source": [
- "## Import libraries"
+ "## Import Libraries"
]
},
{
@@ -90,7 +59,7 @@
"from datasets import load_dataset\n",
"from peft import LoraConfig, get_peft_model\n",
"from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict\n",
- "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n",
"from transformers.trainer_callback import PrinterCallback\n",
"from trl import SFTTrainer\n",
"\n",
@@ -104,15 +73,7 @@
"id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
"metadata": {},
"source": [
- "## Acquiring and preprocessing dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7ba1d8b6-8a5b-41a2-8c77-c9a85e869cda",
- "metadata": {},
- "source": [
- "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)"
+ "## Dataset Preparation"
]
},
{
@@ -123,41 +84,22 @@
"outputs": [],
"source": [
"def file_checksum(file_path, algorithm=\"sha256\"):\n",
- " \"\"\"\n",
- " Calculate the checksum of a file using the specified hashing algorithm.\n",
- "\n",
- " Parameters:\n",
- " file_path (str): The path to the file for which the checksum is to be calculated.\n",
- " algorithm (str): The hashing algorithm to use (default is 'sha256').\n",
- "\n",
- " Returns:\n",
- " str: The calculated checksum of the file.\n",
- " \"\"\"\n",
" hash_func = hashlib.new(algorithm)\n",
" with open(file_path, \"rb\") as f:\n",
" for chunk in iter(lambda: f.read(4096), b\"\"):\n",
" hash_func.update(chunk)\n",
" return hash_func.hexdigest()\n",
"\n",
- "\n",
"if not os.path.exists(\"math_10k.json\"):\n",
" r = requests.get(\n",
" \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n",
" )\n",
- " with open(\n",
- " \"math_10k.json\",\n",
- " \"wb\",\n",
- " ) as f:\n",
+ " with open(\"math_10k.json\", \"wb\") as f:\n",
" f.write(r.content)\n",
"\n",
" actual_checksum = file_checksum(\"math_10k.json\")\n",
- " if (\n",
- " actual_checksum\n",
- " != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n",
- " ):\n",
- " raise ValueError(\n",
- " \"Checksum verification failed. The file may have been altered.\"\n",
- " )\n",
+ " if actual_checksum != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\":\n",
+ " raise ValueError(\"Checksum verification failed. The file may have been altered.\")\n",
"\n",
"raw_dataset = load_dataset(\"json\", data_files=\"math_10k.json\")"
]
@@ -167,7 +109,7 @@
"id": "3ab15ad6-db35-4a58-a2d5-54a6d3ccdc78",
"metadata": {},
"source": [
- "## Initialize arguments and configurations"
+ "## Configuration with 8-bit Quantization"
]
},
{
@@ -177,9 +119,16 @@
"metadata": {},
"outputs": [],
"source": [
+ "# 8-bit quantization config\n",
+ "quantization_config = BitsAndBytesConfig(\n",
+ " load_in_8bit=True,\n",
+ " llm_int8_threshold=6.0,\n",
+ " llm_int8_has_fp16_weight=False,\n",
+ ")\n",
+ "\n",
"training_config = {\n",
" \"bf16\": True,\n",
- " \"use_cpu\": True,\n",
+ " \"use_cpu\": False,\n",
" \"do_eval\": False,\n",
" \"learning_rate\": 5.0e-06,\n",
" \"log_level\": \"info\",\n",
@@ -196,23 +145,26 @@
" \"gradient_checkpointing\": True,\n",
" \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n",
" \"warmup_ratio\": 0.2,\n",
+ " \"optim\": \"adamw_8bit\", # Special 8-bit optimizer\n",
"}\n",
"\n",
"peft_config = {\n",
- " \"r\": 1,\n",
- " \"lora_alpha\": 2,\n",
- " \"lora_dropout\": 0.05,\n",
+ " \"r\": 8,\n",
+ " \"lora_alpha\": 16,\n",
+ " \"lora_dropout\": 0.1,\n",
" \"bias\": \"none\",\n",
" \"task_type\": \"CAUSAL_LM\",\n",
- " \"target_modules\": \"all-linear\",\n",
- " \"modules_to_save\": None,\n",
+ " \"target_modules\": [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+ " \"modules_to_save\": [\"lm_head\", \"embed_tokens\"],\n",
"}\n",
+ "\n",
"model_kwargs = dict(\n",
" use_cache=False,\n",
" trust_remote_code=True,\n",
- " torch_dtype=torch.bfloat16,\n",
- " device_map=None,\n",
+ " quantization_config=quantization_config,\n",
+ " device_map=\"auto\",\n",
")\n",
+ "\n",
"train_conf = TrainingArguments(**training_config)\n",
"peft_conf = LoraConfig(**peft_config)"
]
@@ -222,7 +174,7 @@
"id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
"metadata": {},
"source": [
- "## Load and initialize model"
+ "## Load Quantized Model"
]
},
{
@@ -232,7 +184,7 @@
"metadata": {},
"outputs": [],
"source": [
- "checkpoint_path = \"NyxKrage/Microsoft_Phi-4\"\n",
+ "checkpoint_path = \"microsoft/phi-1_5\"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" checkpoint_path, return_dict=True, **model_kwargs\n",
")\n",
@@ -241,8 +193,8 @@
"tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)\n",
"sequence_max_length = 512\n",
"val_set_size = 2000\n",
- "tokenizer.pad_token_id = 0 # we want this to be different from the eos token\n",
- "tokenizer.padding_side = \"left\" # Allow batched inference"
+ "tokenizer.pad_token_id = 0\n",
+ "tokenizer.padding_side = \"left\""
]
},
{
@@ -250,7 +202,7 @@
"id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
"metadata": {},
"source": [
- "## Preprocess dataset"
+ "## Dataset Preprocessing"
]
},
{
@@ -261,15 +213,6 @@
"outputs": [],
"source": [
"def generate_prompt(data_point):\n",
- " \"\"\"\n",
- " Generate a prompt based on the given data point.\n",
- "\n",
- " Parameters:\n",
- " data_point (dict): A dictionary containing the instruction, input, and output.\n",
- "\n",
- " Returns:\n",
- " str: The generated prompt as a string.\n",
- " \"\"\"\n",
" if data_point[\"input\"]:\n",
" return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. \n",
"\n",
@@ -290,18 +233,7 @@
" ### Response:\n",
" {data_point[\"output\"]}\"\"\"\n",
"\n",
- "\n",
"def tokenize(prompt, add_eos_token=True):\n",
- " \"\"\"\n",
- " Tokenize the given prompt.\n",
- "\n",
- " Parameters:\n",
- " prompt (str): The prompt to be tokenized.\n",
- " add_eos_token (bool): Whether to add an end-of-sequence token (default is True).\n",
- "\n",
- " Returns:\n",
- " dict: A dictionary containing the tokenized input IDs and attention mask.\n",
- " \"\"\"\n",
" result = tokenizer(\n",
" prompt,\n",
" truncation=True,\n",
@@ -318,36 +250,19 @@
" result[\"attention_mask\"].append(1)\n",
"\n",
" result[\"labels\"] = result[\"input_ids\"].copy()\n",
- "\n",
" return result\n",
"\n",
- "\n",
"def generate_and_tokenize_prompt(data_point):\n",
- " \"\"\"\n",
- " Generate and tokenize a prompt based on the given data point.\n",
- "\n",
- " Parameters:\n",
- " data_point (dict): A dictionary containing the instruction, input, and output.\n",
- "\n",
- " Returns:\n",
- " dict: A dictionary containing the tokenized input IDs, attention mask, and labels.\n",
- " \"\"\"\n",
" full_prompt = generate_prompt(data_point)\n",
" tokenized_full_prompt = tokenize(full_prompt)\n",
" user_prompt = generate_prompt({**data_point, \"output\": \"\"})\n",
" tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)\n",
" user_prompt_len = len(tokenized_user_prompt[\"input_ids\"])\n",
"\n",
- " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\n",
- " \"labels\"\n",
- " ][user_prompt_len:]\n",
+ " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\"labels\"][user_prompt_len:]\n",
" return tokenized_full_prompt\n",
"\n",
- "\n",
- "train_val = raw_dataset[\"train\"].train_test_split(\n",
- " test_size=val_set_size, shuffle=True, seed=42\n",
- ")\n",
- "\n",
+ "train_val = raw_dataset[\"train\"].train_test_split(test_size=val_set_size, shuffle=True, seed=42)\n",
"processed_train_dataset = train_val[\"train\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))\n",
"processed_test_dataset = train_val[\"test\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))"
]
@@ -357,8 +272,7 @@
"id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
"metadata": {},
"source": [
- "## Define Federated Averaging Method\n",
- "The FedAvg method is used to average the models from all the collaborators after training."
+ "## Federated Averaging with Quantization Support"
]
},
{
@@ -369,17 +283,6 @@
"outputs": [],
"source": [
"def FedAvg(peft_params, model, weights=None):\n",
- " \"\"\"\n",
- " Perform Federated Averaging (FedAvg) on the model parameters.\n",
- "\n",
- " Parameters:\n",
- " peft_params (list): A list of state dictionaries containing the model parameters from different clients.\n",
- " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n",
- " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n",
- "\n",
- " Returns:\n",
- " torch.nn.Module: The model with the averaged parameters applied.\n",
- " \"\"\"\n",
" state_dicts = peft_params\n",
" state_dict = get_peft_model_state_dict(model)\n",
" for key in peft_params[0]:\n",
@@ -389,6 +292,8 @@
" [state[key].to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n",
" )\n",
" ).to(dtype)\n",
+ " \n",
+ " # Handle quantization when setting state dict\n",
" set_peft_model_state_dict(model, state_dict)\n",
" return model"
]
@@ -398,9 +303,7 @@
"id": "810eb75e",
"metadata": {},
"source": [
- "Now we come to the flow definition. The OpenFL Workflow Interface adopts the conventions set by Metaflow, that every workflow begins with `start` and concludes with the `end` task. The aggregator begins with an optionally passed in model and optimizer. The aggregator begins the flow with the `start` task, where the list of collaborators is extracted from the runtime (`self.collaborators = self.runtime.collaborators`) and is then used as the list of participants to run the task listed in `self.next`, `aggregated_model_validation`. The model, optimizer, and anything that is not explicitly excluded from the next function will be passed from the `start` function on the aggregator to the `aggregated_model_validation` task on the collaborator. Where the tasks run is determined by the placement decorator that precedes each task definition (`@aggregator` or `@collaborator`). Once each of the collaborators (defined in the runtime) complete the `aggregated_model_validation` task, they pass their current state onto the `train` task, from `train` to `local_model_validation`, and then finally to `join` at the aggregator. It is in `join` that an average is taken of the model weights, and the next round can begin.\n",
- "\n",
- ""
+ "## Federated Learning Workflow with Quantization"
]
},
{
@@ -412,62 +315,42 @@
"source": [
"class FederatedFlow(FLSpec):\n",
" def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n",
- " \"\"\"\n",
- " Initialize the class with the given model, optimizer, and number of rounds.\n",
- "\n",
- " Parameters:\n",
- " model (torch.nn.Module, optional): The model to be used. If None, a ValueError is raised.\n",
- " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n",
- " rounds (int, optional): The number of rounds for training or processing (default is 3).\n",
- " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n",
- "\n",
- " Raises:\n",
- " ValueError: If no model is provided.\n",
- " \"\"\"\n",
" super().__init__(**kwargs)\n",
" if model is not None:\n",
" self.model = model\n",
" self.peft_params = get_peft_model_state_dict(self.model)\n",
" self.optimizer = optimizer\n",
+ " \n",
+ " # Print memory usage\n",
+ " print(f\"Model memory footprint: {self.model.get_memory_footprint() / 1024**2:.2f} MB\")\n",
+ " \n",
+ " # Print trainable parameters\n",
+ " trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)\n",
+ " total_params = sum(p.numel() for p in self.model.parameters())\n",
+ " print(f\"Trainable params: {trainable_params} || All params: {total_params} || Trainable%: {100 * trainable_params / total_params:.2f}\")\n",
" else:\n",
" raise ValueError(\"No model inputted\")\n",
"\n",
" self.rounds = rounds\n",
- " \n",
"\n",
" @aggregator\n",
" def start(self):\n",
- " \"\"\"\n",
- " Initialize the model and set up the collaborators for federated learning.\n",
- "\n",
- " This method performs the initial setup for the model, including setting the\n",
- " collaborators, initializing private variables, and starting the first round\n",
- " of the federated learning process.\n",
- " \"\"\"\n",
" print(f\"Performing initialization for model\")\n",
" self.collaborators = self.runtime.collaborators\n",
" self.current_round = 0\n",
- " self.next(\n",
- " self.aggregated_model_validation,\n",
- " foreach=\"collaborators\",\n",
- " )\n",
+ " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
"\n",
- " \n",
" @collaborator\n",
" def aggregated_model_validation(self):\n",
- " \"\"\"\n",
- " Perform aggregated model validation for a collaborator.\n",
- "\n",
- " This method loads the model, applies the PEFT configuration, and evaluates\n",
- " the model using the provided training and evaluation datasets. The validation\n",
- " score is then stored and the next step in the process is triggered.\n",
- " \"\"\"\n",
" print(f\"Performing aggregated model validation for collaborator {self.input}\")\n",
+ " \n",
+ " # Load model with 8-bit quantization\n",
" self.model = AutoModelForCausalLM.from_pretrained(\n",
" checkpoint_path, return_dict=True, **model_kwargs\n",
" )\n",
" self.model = get_peft_model(self.model, peft_conf)\n",
" set_peft_model_state_dict(self.model, self.peft_params)\n",
+ " \n",
" trainer = SFTTrainer(\n",
" model=self.model,\n",
" args=train_conf,\n",
@@ -491,13 +374,12 @@
"\n",
" @collaborator\n",
" def train(self):\n",
- " \"\"\"\n",
- " Train the model for a collaborator.\n",
- "\n",
- " This method trains the model using the provided training and evaluation datasets.\n",
- " The training loss is stored, the model is saved, and the next step in the process\n",
- " is triggered.\n",
- " \"\"\"\n",
+ " print(f\"Training on collaborator {self.input}\")\n",
+ " \n",
+ " # Enable gradient checkpointing for memory efficiency\n",
+ " self.model.gradient_checkpointing_enable()\n",
+ " self.model.config.use_cache = False\n",
+ " \n",
" trainer = SFTTrainer(\n",
" model=self.model,\n",
" args=train_conf,\n",
@@ -513,193 +395,4 @@
" ),\n",
" )\n",
"\n",
- " out = trainer.train()\n",
- " self.loss = out.training_loss\n",
- " trainer.save_model()\n",
- " self.training_completed = True\n",
- " self.next(self.local_model_validation)\n",
- "\n",
- " @collaborator\n",
- " def local_model_validation(self):\n",
- " \"\"\"\n",
- " Perform local model validation for a collaborator.\n",
- "\n",
- " This method evaluates the model using the provided training and evaluation datasets.\n",
- " The validation score is stored, the PEFT parameters are updated, and the next step\n",
- " in the process is triggered.\n",
- " \"\"\"\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=train_conf,\n",
- " peft_config=peft_conf,\n",
- " train_dataset=processed_train_dataset,\n",
- " eval_dataset=processed_test_dataset,\n",
- " max_seq_length=sequence_max_length,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- " out = trainer.evaluate()\n",
- " self.local_validation_score = out[\"eval_loss\"]\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " print(f\"Doing local model validation for collaborator {self.input}\")\n",
- " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n",
- "\n",
- " @aggregator\n",
- " def join(self, inputs):\n",
- " \"\"\"\n",
- " Aggregate the results from all collaborators and update the model.\n",
- "\n",
- " This method calculates the average loss, aggregated model accuracy, and local model\n",
- " accuracy from all collaborators. The model parameters are updated using Federated\n",
- " Averaging (FedAvg), and the next round of the process is triggered if applicable.\n",
- " \"\"\"\n",
- " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n",
- " self.aggregated_model_accuracy = sum(\n",
- " input.agg_validation_score for input in inputs\n",
- " ) / len(inputs)\n",
- " self.local_model_accuracy = sum(\n",
- " input.local_validation_score for input in inputs\n",
- " ) / len(inputs)\n",
- " print(\n",
- " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n",
- " )\n",
- " print(f\"Average training loss = {self.average_loss}\")\n",
- " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n",
- "\n",
- " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- "\n",
- " self.model.save_pretrained(\"./aggregated/model\")\n",
- " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n",
- " self.current_round += 1\n",
- " if self.current_round < self.rounds:\n",
- " self.next(\n",
- " self.aggregated_model_validation,\n",
- " foreach=\"collaborators\",\n",
- " exclude=[\"model\"],\n",
- " )\n",
- " else:\n",
- " self.next(self.end)\n",
- "\n",
- " @aggregator\n",
- " def end(self):\n",
- " \"\"\"\n",
- " End the federated learning process.\n",
- "\n",
- " This method marks the end of the federated learning process and performs any\n",
- " necessary cleanup or finalization steps.\n",
- " \"\"\"\n",
- " print(f\"This is the end of the flow\")\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba",
- "metadata": {},
- "source": [
- "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_dataset` and `eval_dataset` for each of the collaborators. These are **private_attributes** that are exposed only through the runtime. Each participant has its own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task.\n",
- "\n",
- "Below, we segment shards of the Math_10k dataset for **two collaborators**: Portland and Seattle. Each has their own slice of the dataset that's accessible via the `train_dataset` or `eval_dataset` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transferring from collaborator to aggregator, or vice versa."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Setup participants\n",
- "_aggregator = Aggregator()\n",
- "_aggregator.private_attributes = {}\n",
- "\n",
- "# Setup collaborators with private attributes\n",
- "collaborator_names = [\n",
- " \"Portland\",\n",
- " \"Seattle\",\n",
- "]\n",
- "_collaborators = [Collaborator(name=name) for name in collaborator_names]\n",
- "\n",
- "for idx, current_collaborator in enumerate(_collaborators):\n",
- " # Set the private attributes of the Collaborator to include their specific training and testing data loaders\n",
- " current_collaborator.private_attributes = {\n",
- " \"train_dataset\": processed_train_dataset.shard(\n",
- " num_shards=len(_collaborators), index=idx\n",
- " ),\n",
- " \"eval_dataset\": processed_test_dataset.shard(\n",
- " num_shards=len(_collaborators), index=idx\n",
- " ),\n",
- " }\n",
- "\n",
- "local_runtime = LocalRuntime(\n",
- " aggregator=_aggregator, collaborators=_collaborators, backend=\"single_process\"\n",
- ")\n",
- "print(f\"Local runtime collaborators = {local_runtime.collaborators}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9cb61fc0",
- "metadata": {},
- "source": [
- "## Run Experiment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "flflow = FederatedFlow(model, rounds=2)\n",
- "flflow.runtime = local_runtime\n",
- "flflow.run()\n",
- "\n",
- "# Determine the final model accuracy:\n",
- "print(f'\\nFinal aggregated model accuracy for {flflow.rounds} rounds of training: {flflow.aggregated_model_accuracy}')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7bc8fe27",
- "metadata": {},
- "source": [
- "## 🎉 Congratulations! 🎉\n",
- "\n",
- "Now that you've completed this notebook, check out our [other tutorials](https://github.com/securefederatedai/openfl/tree/develop/openfl-tutorials/experimental/)\n",
- "\n",
- "- Using the LocalRuntime Ray Backend for dedicated GPU access\n",
- "- Vertical Federated Learning\n",
- "- Model Watermarking\n",
- "- Differential Privacy\n",
- "- And More!"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
+ " out
From 633c27dbbe2cd0e1076641f7000ca9f5f81e2d2b Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda
<134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Mon, 12 May 2025 15:11:17 +0530
Subject: [PATCH 18/34] Update phi-4-quanti.ipynb
---
openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
index 33281dabb0..960e79d1a5 100644
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -395,4 +395,4 @@
" ),\n",
" )\n",
"\n",
- " out
+ "
From c7a9b42ecc2a4ee6b2d70a2fd9c8fc04aca2121a Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda
<134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Mon, 12 May 2025 15:11:50 +0530
Subject: [PATCH 19/34] Update phi-4-quanti.ipynb
---
.../workflow/LLM/phi-4-quanti.ipynb | 100 +-----------------
1 file changed, 1 insertion(+), 99 deletions(-)
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
index 960e79d1a5..c76ecd5f51 100644
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -297,102 +297,4 @@
" set_peft_model_state_dict(model, state_dict)\n",
" return model"
]
- },
- {
- "cell_type": "markdown",
- "id": "810eb75e",
- "metadata": {},
- "source": [
- "## Federated Learning Workflow with Quantization"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "58298e8e-ab9e-4377-966e-143823441697",
- "metadata": {},
- "outputs": [],
- "source": [
- "class FederatedFlow(FLSpec):\n",
- " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n",
- " super().__init__(**kwargs)\n",
- " if model is not None:\n",
- " self.model = model\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " self.optimizer = optimizer\n",
- " \n",
- " # Print memory usage\n",
- " print(f\"Model memory footprint: {self.model.get_memory_footprint() / 1024**2:.2f} MB\")\n",
- " \n",
- " # Print trainable parameters\n",
- " trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)\n",
- " total_params = sum(p.numel() for p in self.model.parameters())\n",
- " print(f\"Trainable params: {trainable_params} || All params: {total_params} || Trainable%: {100 * trainable_params / total_params:.2f}\")\n",
- " else:\n",
- " raise ValueError(\"No model inputted\")\n",
- "\n",
- " self.rounds = rounds\n",
- "\n",
- " @aggregator\n",
- " def start(self):\n",
- " print(f\"Performing initialization for model\")\n",
- " self.collaborators = self.runtime.collaborators\n",
- " self.current_round = 0\n",
- " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
- "\n",
- " @collaborator\n",
- " def aggregated_model_validation(self):\n",
- " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n",
- " \n",
- " # Load model with 8-bit quantization\n",
- " self.model = AutoModelForCausalLM.from_pretrained(\n",
- " checkpoint_path, return_dict=True, **model_kwargs\n",
- " )\n",
- " self.model = get_peft_model(self.model, peft_conf)\n",
- " set_peft_model_state_dict(self.model, self.peft_params)\n",
- " \n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=train_conf,\n",
- " peft_config=peft_conf,\n",
- " train_dataset=self.train_dataset,\n",
- " eval_dataset=self.eval_dataset,\n",
- " max_seq_length=sequence_max_length,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- "\n",
- " trainer.remove_callback(PrinterCallback)\n",
- " out = trainer.evaluate()\n",
- " self.agg_validation_score = out[\"eval_loss\"]\n",
- " print(f\"{self.input} value of {self.agg_validation_score}\")\n",
- " self.next(self.train)\n",
- "\n",
- " @collaborator\n",
- " def train(self):\n",
- " print(f\"Training on collaborator {self.input}\")\n",
- " \n",
- " # Enable gradient checkpointing for memory efficiency\n",
- " self.model.gradient_checkpointing_enable()\n",
- " self.model.config.use_cache = False\n",
- " \n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=train_conf,\n",
- " peft_config=peft_conf,\n",
- " train_dataset=self.train_dataset,\n",
- " eval_dataset=self.eval_dataset,\n",
- " max_seq_length=sequence_max_length,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- "\n",
- "
+ }
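After patch 19's cleanup, `FedAvg` is the main federated piece left in the file. Stripped of the PEFT plumbing, it is an element-wise (optionally weighted) mean over matching keys of the collaborators' state dicts, computed in float and cast back to the original dtype. A minimal sketch of the same logic with plain tensors:

import numpy as np
import torch

def fedavg(state_dicts, weights=None):
    # Element-wise weighted mean of matching keys across collaborators,
    # computed in float and cast back to each tensor's original dtype.
    averaged = {}
    for key in state_dicts[0]:
        dtype = state_dicts[0][key].dtype
        stacked = [sd[key].to(torch.float).numpy() for sd in state_dicts]
        averaged[key] = torch.from_numpy(
            np.average(stacked, axis=0, weights=weights)
        ).to(dtype)
    return averaged

a = {"w": torch.ones(2, 2)}
b = {"w": torch.zeros(2, 2)}
print(fedavg([a, b])["w"])  # 0.5 everywhere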
From 4067c82e8f5f5f486010d51956434e1f5e607c3d Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda
<134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Mon, 12 May 2025 15:31:03 +0530
Subject: [PATCH 20/34] Update phi-4-quanti.ipynb
---
.../workflow/LLM/phi-4-quanti.ipynb | 405 +++++++++---------
1 file changed, 205 insertions(+), 200 deletions(-)
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
index c76ecd5f51..8435e9e418 100644
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -5,7 +5,7 @@
"id": "a59f475d-d843-46bc-b75e-10984b687ed3",
"metadata": {},
"source": [
- "# Federated Fine-Tuning of Phi-4 Using OpenFL with 8-bit Quantization"
+ "# Federated Fine-Tuning of Phi-4 with 8-bit Quantization"
]
},
{
@@ -13,15 +13,7 @@
"id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
"metadata": {},
"source": [
- "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with 8-bit quantization."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
- "metadata": {},
- "source": [
- "## Installation"
+ "This notebook demonstrates federated fine-tuning of Microsoft's Phi-4 model with 8-bit quantization using OpenFL."
]
},
{
@@ -34,14 +26,6 @@
"!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q"
]
},
- {
- "cell_type": "markdown",
- "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
- "metadata": {},
- "source": [
- "## Import Libraries"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -49,33 +33,23 @@
"metadata": {},
"outputs": [],
"source": [
- "import hashlib\n",
- "import os\n",
- "\n",
- "import numpy as np\n",
- "import requests\n",
"import torch\n",
"import transformers\n",
- "from datasets import load_dataset\n",
+ "from transformers import (\n",
+ " AutoModelForCausalLM,\n",
+ " AutoTokenizer,\n",
+ " BitsAndBytesConfig,\n",
+ " TrainingArguments\n",
+ ")\n",
"from peft import LoraConfig, get_peft_model\n",
- "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict\n",
- "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n",
- "from transformers.trainer_callback import PrinterCallback\n",
+ "from datasets import load_dataset\n",
"from trl import SFTTrainer\n",
- "\n",
+ "import numpy as np\n",
"from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n",
"from openfl.experimental.workflow.placement import aggregator, collaborator\n",
"from openfl.experimental.workflow.runtime import LocalRuntime"
]
},
- {
- "cell_type": "markdown",
- "id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
- "metadata": {},
- "source": [
- "## Dataset Preparation"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -83,218 +57,249 @@
"metadata": {},
"outputs": [],
"source": [
- "def file_checksum(file_path, algorithm=\"sha256\"):\n",
- " hash_func = hashlib.new(algorithm)\n",
- " with open(file_path, \"rb\") as f:\n",
- " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
- " hash_func.update(chunk)\n",
- " return hash_func.hexdigest()\n",
+ "# 8-bit quantization config\n",
+ "quant_config = BitsAndBytesConfig(\n",
+ " load_in_8bit=True,\n",
+ " llm_int8_threshold=6.0,\n",
+ " llm_int8_skip_modules=None,\n",
+ " llm_int8_enable_fp32_cpu_offload=False,\n",
+ " llm_int8_has_fp16_weight=False\n",
+ ")\n",
"\n",
- "if not os.path.exists(\"math_10k.json\"):\n",
- " r = requests.get(\n",
- " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n",
- " )\n",
- " with open(\"math_10k.json\", \"wb\") as f:\n",
- " f.write(r.content)\n",
+ "# Model config\n",
+ "model_kwargs = {\n",
+ " \"quantization_config\": quant_config,\n",
+ " \"device_map\": \"auto\",\n",
+ " \"trust_remote_code\": True\n",
+ "}\n",
"\n",
- " actual_checksum = file_checksum(\"math_10k.json\")\n",
- " if actual_checksum != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\":\n",
- " raise ValueError(\"Checksum verification failed. The file may have been altered.\")\n",
+ "# PEFT config\n",
+ "peft_config = LoraConfig(\n",
+ " r=8,\n",
+ " lora_alpha=16,\n",
+ " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n",
+ " lora_dropout=0.05,\n",
+ " bias=\"none\",\n",
+ " task_type=\"CAUSAL_LM\"\n",
+ ")\n",
"\n",
- "raw_dataset = load_dataset(\"json\", data_files=\"math_10k.json\")"
+ "# Training config\n",
+ "training_config = TrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " per_device_train_batch_size=2,\n",
+ " per_device_eval_batch_size=2,\n",
+ " gradient_accumulation_steps=4,\n",
+ " learning_rate=2e-5,\n",
+ " logging_steps=10,\n",
+ " num_train_epochs=1,\n",
+ " max_grad_norm=0.3,\n",
+ " warmup_ratio=0.03,\n",
+ " lr_scheduler_type=\"cosine\",\n",
+ " save_steps=100,\n",
+ " fp16=True,\n",
+ " optim=\"adamw_torch\",\n",
+ " report_to=\"none\"\n",
+ ")"
]
},
{
- "cell_type": "markdown",
- "id": "3ab15ad6-db35-4a58-a2d5-54a6d3ccdc78",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
"metadata": {},
+ "outputs": [],
"source": [
- "## Configuration with 8-bit Quantization"
+ "# Load model and tokenizer\n",
+ "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-2\", **model_kwargs)\n",
+ "model = get_peft_model(model, peft_config)\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-2\")\n",
+ "tokenizer.pad_token = tokenizer.eos_token"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "eada9809-468a-47c6-9b03-55aa887c9487",
+ "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
"metadata": {},
"outputs": [],
"source": [
- "# 8-bit quantization config\n",
- "quantization_config = BitsAndBytesConfig(\n",
- " load_in_8bit=True,\n",
- " llm_int8_threshold=6.0,\n",
- " llm_int8_has_fp16_weight=False,\n",
- ")\n",
+ "# Dataset preparation\n",
+ "def format_instruction(sample):\n",
+ " return f\"\"\"### Instruction:\n",
+ "{sample['instruction']}\n",
"\n",
- "training_config = {\n",
- " \"bf16\": True,\n",
- " \"use_cpu\": False,\n",
- " \"do_eval\": False,\n",
- " \"learning_rate\": 5.0e-06,\n",
- " \"log_level\": \"info\",\n",
- " \"logging_steps\": 20,\n",
- " \"lr_scheduler_type\": \"cosine\",\n",
- " \"num_train_epochs\": 1,\n",
- " \"output_dir\": \"./checkpoint_dir\",\n",
- " \"overwrite_output_dir\": True,\n",
- " \"per_device_eval_batch_size\": 1,\n",
- " \"per_device_train_batch_size\": 1,\n",
- " \"save_steps\": 100,\n",
- " \"save_total_limit\": 1,\n",
- " \"seed\": 0,\n",
- " \"gradient_checkpointing\": True,\n",
- " \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n",
- " \"warmup_ratio\": 0.2,\n",
- " \"optim\": \"adamw_8bit\", # Special 8-bit optimizer\n",
- "}\n",
+ "### Input:\n",
+ "{sample['input']}\n",
"\n",
- "peft_config = {\n",
- " \"r\": 8,\n",
- " \"lora_alpha\": 16,\n",
- " \"lora_dropout\": 0.1,\n",
- " \"bias\": \"none\",\n",
- " \"task_type\": \"CAUSAL_LM\",\n",
- " \"target_modules\": [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
- " \"modules_to_save\": [\"lm_head\", \"embed_tokens\"],\n",
- "}\n",
+ "### Response:\n",
+ "{sample['output']}\"\"\"\n",
"\n",
- "model_kwargs = dict(\n",
- " use_cache=False,\n",
- " trust_remote_code=True,\n",
- " quantization_config=quantization_config,\n",
- " device_map=\"auto\",\n",
- ")\n",
+ "dataset = load_dataset(\"json\", data_files=\"math_10k.json\")[\"train\"].train_test_split(test_size=0.1)\n",
+ "train_data = dataset[\"train\"].shuffle().select(range(100))\n",
+ "val_data = dataset[\"test\"].shuffle().select(range(20))\n",
"\n",
- "train_conf = TrainingArguments(**training_config)\n",
- "peft_conf = LoraConfig(**peft_config)"
+ "train_data = train_data.map(lambda x: {\"text\": format_instruction(x)})\n",
+ "val_data = val_data.map(lambda x: {\"text\": format_instruction(x)})"
]
},
{
- "cell_type": "markdown",
- "id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
"metadata": {},
+ "outputs": [],
"source": [
- "## Load Quantized Model"
+ "class FederatedFlow(FLSpec):\n",
+ " def __init__(self, model=None, rounds=3, **kwargs):\n",
+ " super().__init__(**kwargs)\n",
+ " self.model = model\n",
+ " self.rounds = rounds\n",
+ " self.training_metrics = []\n",
+ " \n",
+ " @aggregator\n",
+ " def start(self):\n",
+ " print(\"Starting federated training\")\n",
+ " self.collaborators = self.runtime.collaborators\n",
+ " self.current_round = 0\n",
+ " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
+ " \n",
+ " @collaborator\n",
+ " def aggregated_model_validation(self):\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_config,\n",
+ " train_dataset=self.train_data,\n",
+ " eval_dataset=self.val_data,\n",
+ " dataset_text_field=\"text\",\n",
+ " max_seq_length=512,\n",
+ " tokenizer=tokenizer\n",
+ " )\n",
+ " metrics = trainer.evaluate()\n",
+ " self.validation_loss = metrics[\"eval_loss\"]\n",
+ " self.next(self.train)\n",
+ " \n",
+ " @collaborator\n",
+ " def train(self):\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_config,\n",
+ " train_dataset=self.train_data,\n",
+ " eval_dataset=self.val_data,\n",
+ " dataset_text_field=\"text\",\n",
+ " max_seq_length=512,\n",
+ " tokenizer=tokenizer\n",
+ " )\n",
+ " train_result = trainer.train()\n",
+ " self.training_loss = train_result.training_loss\n",
+ " self.training_metrics.append({\n",
+ " \"round\": self.current_round,\n",
+ " \"loss\": self.training_loss,\n",
+ " \"collaborator\": self.input\n",
+ " })\n",
+ " self.next(self.local_model_validation)\n",
+ " \n",
+ " @collaborator\n",
+ " def local_model_validation(self):\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_config,\n",
+ " train_dataset=self.train_data,\n",
+ " eval_dataset=self.val_data,\n",
+ " dataset_text_field=\"text\",\n",
+ " max_seq_length=512,\n",
+ " tokenizer=tokenizer\n",
+ " )\n",
+ " metrics = trainer.evaluate()\n",
+ " self.local_validation_loss = metrics[\"eval_loss\"]\n",
+ " self.next(self.join, exclude=[\"model\"])\n",
+ " \n",
+ " @aggregator\n",
+ " def join(self, inputs):\n",
+ " avg_loss = sum(input.training_loss for input in inputs) / len(inputs)\n",
+ " avg_val_loss = sum(input.validation_loss for input in inputs) / len(inputs)\n",
+ " \n",
+ " print(f\"Round {self.current_round} - Avg Training Loss: {avg_loss:.4f}\")\n",
+ " print(f\"Round {self.current_round} - Avg Validation Loss: {avg_val_loss:.4f}\")\n",
+ " \n",
+ " # Aggregate model updates\n",
+ " self.current_round += 1\n",
+ " if self.current_round < self.rounds:\n",
+ " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
+ " else:\n",
+ " self.next(self.end)\n",
+ " \n",
+ " @aggregator\n",
+ " def end(self):\n",
+ " print(\"Training complete!\")\n",
+ " print(\"Final Training Metrics:\")\n",
+ " for metric in self.training_metrics:\n",
+ " print(f\"Round {metric['round']} - {metric['collaborator']} - Loss: {metric['loss']:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
+ "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
"metadata": {},
"outputs": [],
"source": [
- "checkpoint_path = \"microsoft/phi-1_5\"\n",
- "model = AutoModelForCausalLM.from_pretrained(\n",
- " checkpoint_path, return_dict=True, **model_kwargs\n",
- ")\n",
- "model = get_peft_model(model, peft_conf)\n",
+ "# Setup runtime\n",
+ "aggregator = Aggregator()\n",
+ "collaborators = [\n",
+ " Collaborator(name=\"Portland\", private_attributes={\"train_data\": train_data.shard(2, 0), \"val_data\": val_data.shard(2, 0)}),\n",
+ " Collaborator(name=\"Seattle\", private_attributes={\"train_data\": train_data.shard(2, 1), \"val_data\": val_data.shard(2, 1)})\n",
+ "]\n",
"\n",
- "tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)\n",
- "sequence_max_length = 512\n",
- "val_set_size = 2000\n",
- "tokenizer.pad_token_id = 0\n",
- "tokenizer.padding_side = \"left\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
- "metadata": {},
- "source": [
- "## Dataset Preprocessing"
+ "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend=\"single_process\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
+ "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
"metadata": {},
"outputs": [],
"source": [
- "def generate_prompt(data_point):\n",
- " if data_point[\"input\"]:\n",
- " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. \n",
- "\n",
- " ### Instruction:\n",
- " {data_point[\"instruction\"]}\n",
- " \n",
- " ### Input:\n",
- " {data_point[\"input\"]}\n",
- " \n",
- " ### Response:\n",
- " {data_point[\"output\"]}\"\"\"\n",
- " else:\n",
- " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request. \n",
- "\n",
- " ### Instruction:\n",
- " {data_point[\"instruction\"]}\n",
- " \n",
- " ### Response:\n",
- " {data_point[\"output\"]}\"\"\"\n",
- "\n",
- "def tokenize(prompt, add_eos_token=True):\n",
- " result = tokenizer(\n",
- " prompt,\n",
- " truncation=True,\n",
- " max_length=sequence_max_length,\n",
- " padding=False,\n",
- " return_tensors=None,\n",
- " )\n",
- " if (\n",
- " result[\"input_ids\"][-1] != tokenizer.eos_token_id\n",
- " and len(result[\"input_ids\"]) < sequence_max_length\n",
- " and add_eos_token\n",
- " ):\n",
- " result[\"input_ids\"].append(tokenizer.eos_token_id)\n",
- " result[\"attention_mask\"].append(1)\n",
- "\n",
- " result[\"labels\"] = result[\"input_ids\"].copy()\n",
- " return result\n",
- "\n",
- "def generate_and_tokenize_prompt(data_point):\n",
- " full_prompt = generate_prompt(data_point)\n",
- " tokenized_full_prompt = tokenize(full_prompt)\n",
- " user_prompt = generate_prompt({**data_point, \"output\": \"\"})\n",
- " tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)\n",
- " user_prompt_len = len(tokenized_user_prompt[\"input_ids\"])\n",
- "\n",
- " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\"labels\"][user_prompt_len:]\n",
- " return tokenized_full_prompt\n",
- "\n",
- "train_val = raw_dataset[\"train\"].train_test_split(test_size=val_set_size, shuffle=True, seed=42)\n",
- "processed_train_dataset = train_val[\"train\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))\n",
- "processed_test_dataset = train_val[\"test\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))"
+ "# Run training\n",
+ "flow = FederatedFlow(model, rounds=2)\n",
+ "flow.runtime = runtime\n",
+ "flow.run()"
]
},
{
"cell_type": "markdown",
- "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
+ "id": "7bc8fe27",
"metadata": {},
"source": [
- "## Federated Averaging with Quantization Support"
+ "## Key Features:\n",
+ "\n",
+ "1. **8-bit Quantization**: Enabled through BitsAndBytesConfig\n",
+ "2. **Enhanced Training Metrics**: Tracks and reports loss at each round\n",
+ "3. **PEFT with LoRA**: Parameter-efficient fine-tuning configuration\n",
+ "4. **Memory Optimization**: 8-bit weights and gradient accumulation\n",
+ "5. **Validation Tracking**: Separate validation before and after training"
]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
- "metadata": {},
- "outputs": [],
- "source": [
- "def FedAvg(peft_params, model, weights=None):\n",
- " state_dicts = peft_params\n",
- " state_dict = get_peft_model_state_dict(model)\n",
- " for key in peft_params[0]:\n",
- " dtype = state_dicts[0][key].dtype\n",
- " state_dict[key] = torch.from_numpy(\n",
- " np.average(\n",
- " [state[key].to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n",
- " )\n",
- " ).to(dtype)\n",
- " \n",
- " # Handle quantization when setting state dict\n",
- " set_peft_model_state_dict(model, state_dict)\n",
- " return model"
- ]
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
}
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
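Patch 20's rewrite partitions data by sharding the Hugging Face dataset across collaborators with `Dataset.shard`. Two caveats it leaves open: the download/checksum cell is gone, so `math_10k.json` is assumed to already exist, and the new `join` step only averages metrics and advances the round, so adapter-weight aggregation along the lines of the removed `FedAvg` would still need to be wired back in. The sharding call in isolation:

from datasets import Dataset

data = Dataset.from_dict({"text": [f"sample {i}" for i in range(10)]})
portland = data.shard(num_shards=2, index=0)  # non-overlapping half of the rows
seattle = data.shard(num_shards=2, index=1)   # the other half
print(len(portland), len(seattle))            # 5 5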
From a3b71a5b57dac5083efed6846f5e34e260faf9aa Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda
<134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Mon, 12 May 2025 15:31:56 +0530
Subject: [PATCH 21/34] Update phi-4-quanti.ipynb
---
openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
index 8435e9e418..7a53e6fceb 100644
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -110,9 +110,9 @@
"outputs": [],
"source": [
"# Load model and tokenizer\n",
- "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-2\", **model_kwargs)\n",
+ "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-4\", **model_kwargs)\n",
"model = get_peft_model(model, peft_config)\n",
- "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-2\")\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-4\")\n",
"tokenizer.pad_token = tokenizer.eos_token"
]
},
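Patch 21 corrects the checkpoint id from phi-2 back to phi-4. A caveat on the accompanying `tokenizer.pad_token = tokenizer.eos_token` line: padded positions then carry the EOS id, and it is the attention mask (plus label masking during training) that keeps them from contributing. A small check, using gpt2 purely as a lightweight stand-in for a tokenizer without a native pad token:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
tokenizer.pad_token = tokenizer.eos_token
batch = tokenizer(["short", "a longer example"], padding=True, return_tensors="pt")
print(batch["input_ids"])       # padded positions carry the EOS id
print(batch["attention_mask"])  # zeros mark those positions as padding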
From 0380c355235cb7d55fca1380b73232888ab7f4cf Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda
<134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Mon, 12 May 2025 15:52:08 +0530
Subject: [PATCH 22/34] Update phi-4-quanti.ipynb
---
.../workflow/LLM/phi-4-quanti.ipynb | 63 ++++++++-----------
1 file changed, 26 insertions(+), 37 deletions(-)
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
index 7a53e6fceb..d7cc2b8616 100644
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -13,7 +13,7 @@
"id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
"metadata": {},
"source": [
- "This notebook demonstrates federated fine-tuning of Microsoft's Phi-4 model with 8-bit quantization using OpenFL."
+ "This notebook demonstrates federated fine-tuning of Microsoft's Phi-4 model (4B parameters) with 8-bit quantization using OpenFL."
]
},
{
@@ -57,7 +57,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# 8-bit quantization config\n",
+ "# 8-bit quantization config for Phi-4\n",
"quant_config = BitsAndBytesConfig(\n",
" load_in_8bit=True,\n",
" llm_int8_threshold=6.0,\n",
@@ -66,37 +66,38 @@
" llm_int8_has_fp16_weight=False\n",
")\n",
"\n",
- "# Model config\n",
+ "# Model config for Phi-4\n",
"model_kwargs = {\n",
" \"quantization_config\": quant_config,\n",
" \"device_map\": \"auto\",\n",
- " \"trust_remote_code\": True\n",
+ " \"trust_remote_code\": True,\n",
+ " \"torch_dtype\": torch.bfloat16\n",
"}\n",
"\n",
- "# PEFT config\n",
+ "# PEFT config optimized for Phi-4\n",
"peft_config = LoraConfig(\n",
- " r=8,\n",
- " lora_alpha=16,\n",
- " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n",
+ " r=16, # Higher rank for larger model\n",
+ " lora_alpha=32,\n",
+ " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
" lora_dropout=0.05,\n",
" bias=\"none\",\n",
" task_type=\"CAUSAL_LM\"\n",
")\n",
"\n",
- "# Training config\n",
+ "# Training config adjusted for Phi-4\n",
"training_config = TrainingArguments(\n",
" output_dir=\"./results\",\n",
- " per_device_train_batch_size=2,\n",
- " per_device_eval_batch_size=2,\n",
- " gradient_accumulation_steps=4,\n",
- " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=1, # Reduced for 4B model\n",
+ " per_device_eval_batch_size=1,\n",
+ " gradient_accumulation_steps=8, # Increased for memory efficiency\n",
+ " learning_rate=1e-5, # Lower learning rate for larger model\n",
" logging_steps=10,\n",
" num_train_epochs=1,\n",
" max_grad_norm=0.3,\n",
" warmup_ratio=0.03,\n",
" lr_scheduler_type=\"cosine\",\n",
" save_steps=100,\n",
- " fp16=True,\n",
+ " bf16=True, # Using bfloat16 for Phi-4\n",
" optim=\"adamw_torch\",\n",
" report_to=\"none\"\n",
")"
@@ -109,11 +110,12 @@
"metadata": {},
"outputs": [],
"source": [
- "# Load model and tokenizer\n",
+ "# Load Phi-4 model and tokenizer\n",
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-4\", **model_kwargs)\n",
"model = get_peft_model(model, peft_config)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-4\")\n",
- "tokenizer.pad_token = tokenizer.eos_token"
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "tokenizer.padding_side = \"left\""
]
},
{
@@ -123,7 +125,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Dataset preparation\n",
+ "# Dataset preparation for Phi-4\n",
"def format_instruction(sample):\n",
" return f\"\"\"### Instruction:\n",
"{sample['instruction']}\n",
@@ -135,8 +137,8 @@
"{sample['output']}\"\"\"\n",
"\n",
"dataset = load_dataset(\"json\", data_files=\"math_10k.json\")[\"train\"].train_test_split(test_size=0.1)\n",
- "train_data = dataset[\"train\"].shuffle().select(range(100))\n",
- "val_data = dataset[\"test\"].shuffle().select(range(20))\n",
+ "train_data = dataset[\"train\"].shuffle().select(range(50)) # Smaller subset for Phi-4\n",
+ "val_data = dataset[\"test\"].shuffle().select(range(10))\n",
"\n",
"train_data = train_data.map(lambda x: {\"text\": format_instruction(x)})\n",
"val_data = val_data.map(lambda x: {\"text\": format_instruction(x)})"
@@ -158,7 +160,7 @@
" \n",
" @aggregator\n",
" def start(self):\n",
- " print(\"Starting federated training\")\n",
+ " print(\"Starting federated training for Phi-4\")\n",
" self.collaborators = self.runtime.collaborators\n",
" self.current_round = 0\n",
" self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
@@ -230,7 +232,7 @@
" \n",
" @aggregator\n",
" def end(self):\n",
- " print(\"Training complete!\")\n",
+ " print(\"Phi-4 Training complete!\")\n",
" print(\"Final Training Metrics:\")\n",
" for metric in self.training_metrics:\n",
" print(f\"Round {metric['round']} - {metric['collaborator']} - Loss: {metric['loss']:.4f}\")"
@@ -243,7 +245,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Setup runtime\n",
+ "# Setup runtime for Phi-4\n",
"aggregator = Aggregator()\n",
"collaborators = [\n",
" Collaborator(name=\"Portland\", private_attributes={\"train_data\": train_data.shard(2, 0), \"val_data\": val_data.shard(2, 0)}),\n",
@@ -260,26 +262,13 @@
"metadata": {},
"outputs": [],
"source": [
- "# Run training\n",
+ "# Run training for Phi-4\n",
"flow = FederatedFlow(model, rounds=2)\n",
"flow.runtime = runtime\n",
"flow.run()"
]
},
- {
- "cell_type": "markdown",
- "id": "7bc8fe27",
- "metadata": {},
- "source": [
- "## Key Features:\n",
- "\n",
- "1. **8-bit Quantization**: Enabled through BitsAndBytesConfig\n",
- "2. **Enhanced Training Metrics**: Tracks and reports loss at each round\n",
- "3. **PEFT with LoRA**: Parameter-efficient fine-tuning configuration\n",
- "4. **Memory Optimization**: 8-bit weights and gradient accumulation\n",
- "5. **Validation Tracking**: Separate validation before and after training"
- ]
- }
+
],
"metadata": {
"kernelspec": {
From 0335de7ae96f6441f3129e38e79078028091171b Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda
<134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Mon, 12 May 2025 15:52:51 +0530
Subject: [PATCH 23/34] Update phi-4-quanti.ipynb
---
.../workflow/LLM/phi-4-quanti.ipynb | 41 ++++++++++---------
1 file changed, 21 insertions(+), 20 deletions(-)
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
index d7cc2b8616..9780a0d63a 100644
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -268,26 +268,27 @@
"flow.run()"
]
},
-
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ {
+ "cell_type": "markdown",
+ "id": "7bc8fe27",
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
},
"nbformat": 4,
"nbformat_minor": 5
From 7bb70cfc478d6d9655a6ec5ad4002d7babfb2bed Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda
<134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Mon, 12 May 2025 15:53:13 +0530
Subject: [PATCH 24/34] Update phi-4-quanti.ipynb
---
.../workflow/LLM/phi-4-quanti.ipynb | 41 +++++++++++--------
1 file changed, 23 insertions(+), 18 deletions(-)
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
index 9780a0d63a..59ae25d98a 100644
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
@@ -271,24 +271,29 @@
{
"cell_type": "markdown",
"id": "7bc8fe27",
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
+ "metadata": {},
+ "source": [
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
},
"nbformat": 4,
"nbformat_minor": 5
From 02f194248a410fac9cda6668b689dccdcb3f4a30 Mon Sep 17 00:00:00 2001
From: rajithkrishnegowda
<134698520+rajithkrishnegowda@users.noreply.github.com>
Date: Thu, 15 May 2025 18:02:27 +0530
Subject: [PATCH 25/34] Add files via upload
---
.../experimental/workflow/LLM/phi-4-sol.ipynb | 438 ++++++++++++++++++
1 file changed, 438 insertions(+)
create mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb
new file mode 100644
index 0000000000..b1ca1bac6e
--- /dev/null
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb
@@ -0,0 +1,438 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "a59f475d-d843-46bc-b75e-10984b687ed3",
+ "metadata": {},
+ "source": [
+ "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
+ "metadata": {},
+ "source": [
+ "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n",
+ "- Parameter-Efficient Fine-Tuning (PEFT)\n",
+ "- 4-bit Quantization (QLoRA)\n",
+ "- Gradient Checkpointing\n",
+ "- Optimized Training Configuration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
+ "metadata": {},
+ "source": [
+ "## Installation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
+ "metadata": {},
+ "source": [
+ "## Import Libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "be4690ae-0671-4d3a-8f21-620ab865a03e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "from transformers import (\n",
+ " AutoModelForCausalLM,\n",
+ " AutoTokenizer,\n",
+ " BitsAndBytesConfig,\n",
+ " TrainingArguments\n",
+ ")\n",
+ "from peft import (\n",
+ " LoraConfig,\n",
+ " get_peft_model,\n",
+ " prepare_model_for_kbit_training,\n",
+ " PeftModel\n",
+ ")\n",
+ "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # Added this import\n",
+ "from datasets import load_dataset\n",
+ "from trl import SFTTrainer\n",
+ "from openfl.experimental.workflow import FLSpec, Aggregator, Collaborator, LocalRuntime\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eada9809-468a-47c6-9b03-55aa887c9487",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Model and dataset\n",
+ "model_name = \"microsoft/phi-4\"\n",
+ "dataset_name = \"math_10k.json\"\n",
+ "\n",
+ "# QLoRA configuration\n",
+ "bnb_config = BitsAndBytesConfig(\n",
+ " load_in_4bit=True,\n",
+ " bnb_4bit_quant_type=\"nf4\",\n",
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
+ " bnb_4bit_use_double_quant=True,\n",
+ ")\n",
+ "\n",
+ "# LoRA configuration\n",
+ "peft_config = LoraConfig(\n",
+ " r=16, # Increased from original for better adaptation\n",
+ " lora_alpha=32,\n",
+ " lora_dropout=0.05,\n",
+ " bias=\"none\",\n",
+ " task_type=\"CAUSAL_LM\",\n",
+ " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"dense\"],\n",
+ ")\n",
+ "\n",
+ "# Training configuration\n",
+ "training_args = TrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " num_train_epochs=1,\n",
+ " per_device_train_batch_size=1, # Reduced for Phi-4\n",
+ " gradient_accumulation_steps=2,\n",
+ " optim=\"paged_adamw_32bit\",\n",
+ " save_steps=100,\n",
+ " logging_steps=10,\n",
+ " learning_rate=2e-4,\n",
+ " weight_decay=0.001,\n",
+ " fp16=False,\n",
+ " bf16=True,\n",
+ " max_grad_norm=0.3,\n",
+ " warmup_ratio=0.03,\n",
+ " lr_scheduler_type=\"cosine\",\n",
+ " gradient_checkpointing=True,\n",
+ " report_to=\"none\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
+ "metadata": {},
+ "source": [
+ "## Load and Prepare Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load tokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "tokenizer.padding_side = \"right\"\n",
+ "\n",
+ "# Load model with quantization\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " quantization_config=bnb_config,\n",
+ " device_map=\"auto\",\n",
+ " trust_remote_code=True\n",
+ ")\n",
+ "\n",
+ "# Prepare model for k-bit training\n",
+ "model = prepare_model_for_kbit_training(model)\n",
+ "\n",
+ "# Apply LoRA\n",
+ "model = get_peft_model(model, peft_config)\n",
+ "model.print_trainable_parameters()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
+ "metadata": {},
+ "source": [
+ "## Load and Prepare Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def format_prompt(example):\n",
+ " if example[\"input\"]:\n",
+ " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
+ "\n",
+ "### Instruction:\n",
+ "{example['instruction']}\n",
+ "\n",
+ "### Input:\n",
+ "{example['input']}\n",
+ "\n",
+ "### Response:\n",
+ "{example['output']}\"\"\"\n",
+ " else:\n",
+ " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
+ "\n",
+ "### Instruction:\n",
+ "{example['instruction']}\n",
+ "\n",
+ "### Response:\n",
+ "{example['output']}\"\"\"\n",
+ "\n",
+ "# Load dataset\n",
+ "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\")\n",
+ "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)})\n",
+ "\n",
+ "# Split dataset\n",
+ "dataset = dataset.train_test_split(test_size=0.1)\n",
+ "train_dataset = dataset[\"train\"]\n",
+ "eval_dataset = dataset[\"test\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
+ "metadata": {},
+ "source": [
+ "## Enhanced Training with SFTTrainer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainer = SFTTrainer(\n",
+ " model=model,\n",
+ " train_dataset=train_dataset,\n",
+ " eval_dataset=eval_dataset,\n",
+ " peft_config=peft_config,\n",
+ " dataset_text_field=\"text\",\n",
+ " max_seq_length=1024,\n",
+ " tokenizer=tokenizer,\n",
+ " args=training_args,\n",
+ " packing=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "810eb75e",
+ "metadata": {},
+ "source": [
+ "## Federated Averaging Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58298e8e-ab9e-4377-966e-143823441697",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def FedAvg(peft_params, model, weights=None):\n",
+ " \"\"\"\n",
+ " Perform Federated Averaging (FedAvg) on the model parameters.\n",
+ " \"\"\"\n",
+ " state_dicts = peft_params\n",
+ " state_dict = get_peft_model_state_dict(model)\n",
+ " for key in peft_params[0]:\n",
+ " dtype = state_dicts[0][key].dtype\n",
+ " state_dict[key] = torch.from_numpy(\n",
+ " np.average(\n",
+ " [state[key].to(torch.float).numpy() for state in state_dicts], \n",
+ " axis=0, \n",
+ " weights=weights\n",
+ " )\n",
+ " ).to(dtype)\n",
+ " set_peft_model_state_dict(model, state_dict)\n",
+ " return model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba",
+ "metadata": {},
+ "source": [
+ "## Federated Learning Workflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class FederatedFlow(FLSpec):\n",
+ " def __init__(self, model=None, rounds=3, **kwargs):\n",
+ " super().__init__(**kwargs)\n",
+ " if model is not None:\n",
+ " self.model = model\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " else:\n",
+ " raise ValueError(\"No model provided\")\n",
+ " \n",
+ " self.rounds = rounds\n",
+ " \n",
+ " @aggregator\n",
+ " def start(self):\n",
+ " print(\"Initializing federated learning\")\n",
+ " self.collaborators = self.runtime.collaborators\n",
+ " self.current_round = 0\n",
+ " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
+ " \n",
+ " @collaborator\n",
+ " def aggregated_model_validation(self):\n",
+ " print(f\"Validating aggregated model for {self.input}\")\n",
+ " # Load model with quantization\n",
+ " self.model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " quantization_config=bnb_config,\n",
+ " device_map=\"auto\",\n",
+ " trust_remote_code=True\n",
+ " )\n",
+ " self.model = prepare_model_for_kbit_training(self.model)\n",
+ " self.model = get_peft_model(self.model, peft_config)\n",
+ " set_peft_model_state_dict(self.model, self.peft_params)\n",
+ " \n",
+ " # Evaluate\n",
+ " eval_results = trainer.evaluate()\n",
+ " self.agg_validation_score = eval_results[\"eval_loss\"]\n",
+ " print(f\"Validation loss: {self.agg_validation_score}\")\n",
+ " self.next(self.train)\n",
+ " \n",
+ " @collaborator\n",
+ " def train(self):\n",
+ " print(f\"Training on {self.input}\")\n",
+ " # Train with local data\n",
+ " trainer.train()\n",
+ " self.loss = trainer.state.log_history[-1][\"loss\"]\n",
+ " self.next(self.local_model_validation)\n",
+ " \n",
+ " @collaborator\n",
+ " def local_model_validation(self):\n",
+ " print(f\"Validating local model for {self.input}\")\n",
+ " eval_results = trainer.evaluate()\n",
+ " self.local_validation_score = eval_results[\"eval_loss\"]\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " self.next(self.join, exclude=[\"model\"])\n",
+ " \n",
+ " @aggregator\n",
+ " def join(self, inputs):\n",
+ " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n",
+ " self.aggregated_model_accuracy = sum(\n",
+ " input.agg_validation_score for input in inputs\n",
+ " ) / len(inputs)\n",
+ " self.local_model_accuracy = sum(\n",
+ " input.local_validation_score for input in inputs\n",
+ " ) / len(inputs)\n",
+ " \n",
+ " print(f\"Round {self.current_round + 1} results:\")\n",
+ " print(f\"Average training loss: {self.average_loss}\")\n",
+ " print(f\"Average validation loss (before training): {self.aggregated_model_accuracy}\")\n",
+ " print(f\"Average validation loss (after training): {self.local_model_accuracy}\")\n",
+ " \n",
+ " # Federated averaging\n",
+ " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " \n",
+ " self.current_round += 1\n",
+ " if self.current_round < self.rounds:\n",
+ " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
+ " else:\n",
+ " self.next(self.end)\n",
+ " \n",
+ " @aggregator\n",
+ " def end(self):\n",
+ " print(\"Federated training complete!\")\n",
+ " print(f\"Final model validation loss: {self.aggregated_model_accuracy}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7bc8fe27",
+ "metadata": {},
+ "source": [
+ "## Run Federated Learning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Setup participants\n",
+ "aggregator = Aggregator()\n",
+ "collaborators = [\n",
+ " Collaborator(name=\"Portland\"),\n",
+ " Collaborator(name=\"Seattle\"),\n",
+ " Collaborator(name=\"London\")\n",
+ "]\n",
+ "\n",
+ "# Assign data shards\n",
+ "for idx, colab in enumerate(collaborators):\n",
+ " colab.private_attributes = {\n",
+ " \"train_dataset\": train_dataset.shard(len(collaborators), idx),\n",
+ " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx)\n",
+ " }\n",
+ "\n",
+ "# Create and run workflow\n",
+ "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n",
+ "flflow = FederatedFlow(model, rounds=3)\n",
+ "flflow.runtime = runtime\n",
+ "flflow.run()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
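
Before the next patch reworks this notebook, it is worth spelling out what the FedAvg step above computes: each collaborator's PEFT state dict is converted to float32 numpy arrays on the CPU, averaged element-wise (optionally weighted, e.g. by local dataset size), and cast back to the original dtype. A minimal standalone sketch of that averaging step follows; the function name fedavg_state_dicts and the toy tensor key are illustrative, not taken from the notebook:

    import numpy as np
    import torch

    def fedavg_state_dicts(state_dicts, weights=None):
        """Element-wise (optionally weighted) average of a list of state dicts."""
        averaged = {}
        for key in state_dicts[0]:
            dtype = state_dicts[0][key].dtype
            # Average in float32 on CPU to avoid precision and device issues,
            # then cast back to the original dtype (e.g. bfloat16 LoRA weights).
            stacked = [sd[key].cpu().to(torch.float).numpy() for sd in state_dicts]
            averaged[key] = torch.from_numpy(
                np.average(stacked, axis=0, weights=weights)
            ).to(dtype)
        return averaged

    # Two toy "collaborator" updates for a single LoRA matrix.
    a = {"lora_A.weight": torch.full((2, 2), 1.0, dtype=torch.bfloat16)}
    b = {"lora_A.weight": torch.full((2, 2), 3.0, dtype=torch.bfloat16)}

    print(fedavg_state_dicts([a, b]))                  # unweighted mean -> 2.0
    print(fedavg_state_dicts([a, b], weights=[3, 1]))  # weighted mean -> 1.5

Weighting matters when collaborators hold different amounts of data: passing per-collaborator sample counts as weights makes each site contribute proportionally to the aggregated adapter.
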
From 173c1e3b3ceee3a42abe387ca0bbb4693a98b808 Mon Sep 17 00:00:00 2001
From: Rajith
Date: Fri, 16 May 2025 11:04:32 +0530
Subject: [PATCH 26/34] adding phi-4 with 4 bit quantization
---
.../LLM/phi-4-with4bit quantization.ipynb | 1772 +++++++++++++++++
.../experimental/workflow/LLM/phi-4.ipynb | 705 -------
2 files changed, 1772 insertions(+), 705 deletions(-)
create mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb
delete mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb
new file mode 100644
index 0000000000..e2efa9054b
--- /dev/null
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb
@@ -0,0 +1,1772 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "a59f475d-d843-46bc-b75e-10984b687ed3",
+ "metadata": {},
+ "source": [
+ "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
+ "metadata": {},
+ "source": [
+ "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n",
+ "- Parameter-Efficient Fine-Tuning (PEFT)\n",
+ "- 4-bit Quantization (QLoRA)\n",
+ "- Gradient Checkpointing\n",
+ "- Optimized Training Configuration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
+ "metadata": {},
+ "source": [
+ "## Installation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a7ae1a7e-8c16-4c5a-be57-33d84723aed7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Thu May 15 13:27:27 2025 \n",
+ "+-----------------------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |\n",
+ "|-----------------------------------------+------------------------+----------------------+\n",
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
+ "| | | MIG M. |\n",
+ "|=========================================+========================+======================|\n",
+ "| 0 NVIDIA H100 NVL Off | 00000001:00:00.0 Off | 0 |\n",
+ "| N/A 39C P0 62W / 400W | 1MiB / 95830MiB | 0% Default |\n",
+ "| | | Disabled |\n",
+ "+-----------------------------------------+------------------------+----------------------+\n",
+ " \n",
+ "+-----------------------------------------------------------------------------------------+\n",
+ "| Processes: |\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
+ "| ID ID Usage |\n",
+ "|=========================================================================================|\n",
+ "| No running processes found |\n",
+ "+-----------------------------------------------------------------------------------------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
+ "metadata": {},
+ "source": [
+ "## Import Libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "be4690ae-0671-4d3a-8f21-620ab865a03e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/azureuser/env_name/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "2025-05-15 13:27:30,648\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# System imports\n",
+ "import os\n",
+ "import numpy as np\n",
+ "\n",
+ "# PyTorch imports\n",
+ "import torch\n",
+ "\n",
+ "# Hugging Face Transformers imports for model loading and training\n",
+ "from transformers import (\n",
+ " AutoModelForCausalLM, # For loading large language models\n",
+ " AutoTokenizer, # For tokenizing text inputs\n",
+ " BitsAndBytesConfig, # For 4-bit quantization configuration\n",
+ " TrainingArguments # For configuring training hyperparameters\n",
+ ")\n",
+ "\n",
+ "# PEFT (Parameter-Efficient Fine-Tuning) imports\n",
+ "from peft import (\n",
+ " LoraConfig, # For configuring Low-Rank Adaptation\n",
+ " get_peft_model, # For applying PEFT to a model\n",
+ " prepare_model_for_kbit_training, # For preparing quantized models for training\n",
+ " PeftModel # Base class for PEFT models\n",
+ ")\n",
+ "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # For state dict manipulation\n",
+ "\n",
+ "# Dataset and training imports\n",
+ "from datasets import load_dataset\n",
+ "from trl import SFTTrainer # Supervised Fine-Tuning Trainer\n",
+ "\n",
+ "# OpenFL imports for federated learning\n",
+ "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n",
+ "from openfl.experimental.workflow.placement import aggregator, collaborator\n",
+ "from openfl.experimental.workflow.runtime import LocalRuntime"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "06274755",
+ "metadata": {},
+ "source": [
+ "## Acquiring and preprocessing dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a6edefa4",
+ "metadata": {},
+ "source": [
+ "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "962ac825",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import libraries needed for downloading and verifying the dataset\n",
+ "import hashlib\n",
+ "import requests\n",
+ "\n",
+ "def file_checksum(file_path, algorithm=\"sha256\"):\n",
+ " \"\"\"\n",
+ " Calculate the checksum of a file using the specified hashing algorithm.\n",
+ " \n",
+ " Args:\n",
+ " file_path (str): The path to the file for which the checksum is to be calculated.\n",
+ " algorithm (str): The hashing algorithm to use (default is 'sha256').\n",
+ " \n",
+ " Returns:\n",
+ " str: The calculated checksum of the file.\n",
+ " \"\"\"\n",
+ " hash_func = hashlib.new(algorithm)\n",
+ " with open(file_path, \"rb\") as f:\n",
+ " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
+ " hash_func.update(chunk)\n",
+ " return hash_func.hexdigest()\n",
+ "\n",
+ "\n",
+ "# Download the dataset if it doesn't exist locally\n",
+ "if not os.path.exists(\"math_10k.json\"):\n",
+ " print(\"Downloading math_10k.json dataset...\")\n",
+ " r = requests.get(\n",
+ " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n",
+ " )\n",
+ " with open(\n",
+ " \"math_10k.json\",\n",
+ " \"wb\",\n",
+ " ) as f:\n",
+ " f.write(r.content)\n",
+ " print(\"Download complete.\")\n",
+ "\n",
+ " # Verify the integrity of the downloaded file\n",
+ " actual_checksum = file_checksum(\"math_10k.json\")\n",
+ " expected_checksum = \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n",
+ " if actual_checksum != expected_checksum:\n",
+ " raise ValueError(\n",
+ " \"Checksum verification failed. The file may have been altered.\"\n",
+ " )\n",
+ " print(\"Checksum verification successful.\")\n",
+ "else:\n",
+ " print(\"Dataset already exists locally.\")\n",
+ "\n",
+ "# Set the dataset path to be used later\n",
+ "dataset_name = \"math_10k.json\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eada9809-468a-47c6-9b03-55aa887c9487",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Model and dataset configuration\n",
+ "model_name = \"microsoft/phi-4\" # Pre-trained model identifier from Hugging Face Hub\n",
+ "#dataset_name = \"math_10k.json\" # Dataset file containing mathematical QA pairs\n",
+ "\n",
+ "# QLoRA (Quantized Low-Rank Adaptation) configuration for 4-bit quantization\n",
+ "# This reduces memory footprint while maintaining model quality\n",
+ "bnb_config = BitsAndBytesConfig(\n",
+ " load_in_4bit=True, # Enable 4-bit quantization\n",
+ " bnb_4bit_quant_type=\"nf4\", # Use normalized float 4 format for better precision\n",
+ " bnb_4bit_compute_dtype=torch.bfloat16, # Computation precision\n",
+ " bnb_4bit_use_double_quant=False, # Disable nested quantization for simplicity\n",
+ ")\n",
+ "\n",
+ "# LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning\n",
+ "# This allows fine-tuning with significantly fewer parameters\n",
+ "peft_config = LoraConfig(\n",
+ " r=8, # Rank of the update matrices (higher = more capacity but more parameters)\n",
+ " lora_alpha=16, # Scaling factor for the trained weights\n",
+ " lora_dropout=0.01, # Dropout probability for LoRA layers\n",
+ " bias=\"none\", # Don't train bias parameters to reduce memory\n",
+ " task_type=\"CAUSAL_LM\", # Specify causal language modeling task\n",
+ " target_modules=\"all-linear\", # Apply LoRA to all linear layers\n",
+ ")\n",
+ "\n",
+ "# Training hyperparameters configuration\n",
+ "training_args = TrainingArguments(\n",
+ " output_dir=\"./results\", # Directory to save checkpoints and logs\n",
+ " num_train_epochs=1, # Number of training epochs\n",
+ " per_device_train_batch_size=2, # Batch size per GPU/TPU core\n",
+ " gradient_accumulation_steps=2, # Number of updates steps to accumulate before backward pass\n",
+ " optim=\"adamw_torch_fused\", # Optimizer to use (fused for better performance)\n",
+ " save_steps=100, # Save checkpoint every X updates steps\n",
+ " logging_steps=10, # Log metrics every X updates steps\n",
+ " learning_rate=3e-4, # Initial learning rate\n",
+ " weight_decay=0.001, # Weight decay regularization\n",
+ " fp16=False, # Disable FP16 training (using BF16 instead)\n",
+ " bf16=True, # Enable BF16 training (better numerical stability than FP16)\n",
+ " max_grad_norm=0.5, # Max gradient norm for gradient clipping\n",
+ " warmup_ratio=0.02, # Portion of steps for learning rate warmup\n",
+ " lr_scheduler_type=\"cosine\", # Learning rate scheduler type\n",
+ " gradient_checkpointing=True, # Enable gradient checkpointing to save memory\n",
+ " report_to=\"none\" # Disable reporting to tracking platforms\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
+ "metadata": {},
+ "source": [
+ "## Load and Prepare Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.36it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "trainable params: 27,852,800 || all params: 14,687,360,000 || trainable%: 0.1896\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load tokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "tokenizer.padding_side = \"right\"\n",
+ "\n",
+ "# Load model with quantization\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " quantization_config=bnb_config,\n",
+ " device_map=\"auto\",\n",
+ " trust_remote_code=True\n",
+ ")\n",
+ "\n",
+ "# Prepare model for k-bit training\n",
+ "model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)\n",
+ "\n",
+ "# Apply LoRA\n",
+ "model = get_peft_model(model, peft_config)\n",
+ "model.print_trainable_parameters()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
+ "metadata": {},
+ "source": [
+ "## Load and Prepare Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def format_prompt(example):\n",
+ " \"\"\"\n",
+ " Format a dataset example into a standardized prompt-response format for instruction tuning.\n",
+ " \n",
+ " This function converts raw dataset examples into a structured format suitable for\n",
+ " instruction fine-tuning of large language models. The format follows the common\n",
+ " pattern used for instruction-following tasks with clear section demarcation.\n",
+ " \n",
+ " Args:\n",
+ " example (dict): A dictionary containing the example data with keys:\n",
+ " - 'instruction': The task instruction\n",
+ " - 'input': The optional input context (may be empty)\n",
+ " - 'output': The expected output/response\n",
+ " \n",
+ " Returns:\n",
+ " str: A formatted prompt string with instruction, optional input, and response\n",
+ " \"\"\"\n",
+ " if example[\"input\"]:\n",
+ " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
+ "\n",
+ "### Instruction:\n",
+ "{example['instruction']}\n",
+ "\n",
+ "### Input:\n",
+ "{example['input']}\n",
+ "\n",
+ "### Response:\n",
+ "{example['output']}\"\"\"\n",
+ " else:\n",
+ " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
+ "\n",
+ "### Instruction:\n",
+ "{example['instruction']}\n",
+ "\n",
+ "### Response:\n",
+ "{example['output']}\"\"\"\n",
+ "\n",
+ "# Load dataset from JSON file (contains mathematical question-answer pairs)\n",
+ "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\", num_proc=4)\n",
+ "\n",
+ "# Transform raw examples into formatted text for instruction tuning\n",
+ "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)}, num_proc=4)\n",
+ "\n",
+ "# Split dataset into training (90%) and evaluation (10%) sets\n",
+ "dataset = dataset.train_test_split(test_size=0.1)\n",
+ "train_dataset = dataset[\"train\"]\n",
+ "eval_dataset = dataset[\"test\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
+ "metadata": {},
+ "source": [
+ "## Enhanced Training with SFTTrainer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Generating train split: 1820 examples [00:02, 613.71 examples/s]\n",
+ "Generating train split: 209 examples [00:00, 582.95 examples/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "trainer = SFTTrainer(\n",
+ " model=model,\n",
+ " train_dataset=train_dataset,\n",
+ " eval_dataset=eval_dataset,\n",
+ " peft_config=peft_config,\n",
+ " dataset_text_field=\"text\",\n",
+ " max_seq_length=1024,\n",
+ " tokenizer=tokenizer,\n",
+ " args=training_args,\n",
+ " packing=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "810eb75e",
+ "metadata": {},
+ "source": [
+ "## Federated Averaging Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58298e8e-ab9e-4377-966e-143823441697",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def FedAvg(peft_params, model, weights=None):\n",
+ " \"\"\"\n",
+ " Perform Federated Averaging (FedAvg) on the model parameters.\n",
+ " \n",
+ " This function aggregates PEFT parameters from multiple collaborators using weighted\n",
+ " averaging. It handles the complex task of averaging parameters while maintaining \n",
+ " the correct tensor types and shapes required by the PEFT framework.\n",
+ " \n",
+ " Args:\n",
+ " peft_params (list): A list of state dictionaries containing PEFT parameters from different collaborators.\n",
+ " model (torch.nn.Module): The base model to which the averaged parameters will be applied.\n",
+ " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n",
+ " Weights determine the contribution of each collaborator to the final model.\n",
+ " \n",
+ " Returns:\n",
+ " torch.nn.Module: The model with the averaged parameters applied.\n",
+ " \n",
+ " Notes:\n",
+ " The function converts tensors to float for averaging to avoid precision issues,\n",
+ " then converts back to the original data type for model compatibility.\n",
+ " \"\"\"\n",
+ " # Store the state dictionaries for easy access\n",
+ " state_dicts = peft_params\n",
+ " # Get the current state dict from the model as a template\n",
+ " state_dict = get_peft_model_state_dict(model)\n",
+ " \n",
+ " # Iterate through each parameter in the first state dict as reference\n",
+ " for key in peft_params[0]:\n",
+ " # Store original data type for later conversion\n",
+ " dtype = state_dicts[0][key].dtype\n",
+ " \n",
+ " # Convert all tensors to float, move to CPU, perform weighted average\n",
+ " state_dict[key] = torch.from_numpy(\n",
+ " np.average(\n",
+ " [state[key].cpu().to(torch.float).numpy() for state in state_dicts], \n",
+ " axis=0, \n",
+ " weights=weights\n",
+ " )\n",
+ " ).to(dtype) # Convert back to original data type\n",
+ " \n",
+ " # Apply the averaged parameters back to the model\n",
+ " set_peft_model_state_dict(model, state_dict)\n",
+ " return model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba",
+ "metadata": {},
+ "source": [
+ "## Federated Learning Workflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Aggregator step \"start\" registered\n",
+ "Collaborator step \"aggregated_model_validation\" registered\n",
+ "Collaborator step \"train\" registered\n",
+ "Collaborator step \"local_model_validation\" registered\n",
+ "Aggregator step \"join\" registered\n",
+ "Aggregator step \"end\" registered\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Import the required PrinterCallback for proper initialization/removal\n",
+ "from transformers.trainer_callback import PrinterCallback\n",
+ "import transformers\n",
+ "\n",
+ "class FederatedFlow(FLSpec):\n",
+ " \"\"\"\n",
+ " Federated Learning workflow for fine-tuning Phi-4 model with PEFT and quantization.\n",
+ " \n",
+ " This class implements the complete federated learning workflow for a language model,\n",
+ " including initialization, aggregated model validation, training, local model validation,\n",
+ " and parameter aggregation. It uses Parameter-Efficient Fine-Tuning (PEFT) with 4-bit\n",
+ " quantization to efficiently train large language models in memory-constrained environments.\n",
+ " \n",
+ " The workflow follows these steps for each round:\n",
+ " 1. Initialize model on each collaborator\n",
+ " 2. Validate the aggregated model on local data\n",
+ " 3. Train the model locally on each collaborator\n",
+ " 4. Validate the locally trained model\n",
+ " 5. Aggregate PEFT parameters from all collaborators using FedAvg\n",
+ " 6. Repeat for specified number of rounds\n",
+ " \n",
+ " Attributes:\n",
+ " model: The base language model being fine-tuned\n",
+ " peft_params: PEFT parameters dictionary for the model\n",
+ " optimizer: Optimizer for training (optional)\n",
+ " rounds: Number of federated learning rounds to perform\n",
+ " current_round: Counter for the current round\n",
+ " collaborators: List of collaborators participating in federated learning\n",
+ " \"\"\"\n",
+ " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n",
+ " \"\"\"\n",
+ " Initialize the federated learning workflow.\n",
+ " \n",
+ " Args:\n",
+ " model: The base language model to fine-tune. Must be provided.\n",
+ " optimizer: Optional optimizer for model training.\n",
+ " rounds: Number of federated learning rounds to perform (default: 3).\n",
+ " **kwargs: Additional arguments passed to the parent class.\n",
+ " \n",
+ " Raises:\n",
+ " ValueError: If no model is provided.\n",
+ " \"\"\"\n",
+ " super().__init__(**kwargs)\n",
+ " if model is not None:\n",
+ " self.model = model\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " self.optimizer = optimizer\n",
+ " else:\n",
+ " raise ValueError(\"No model inputted\")\n",
+ "\n",
+ " self.rounds = rounds\n",
+ " \n",
+ "\n",
+ " @aggregator\n",
+ " def start(self):\n",
+ " \"\"\"\n",
+ " Start the federated learning process on the aggregator.\n",
+ " \n",
+ " This method initializes the workflow by:\n",
+ " 1. Setting up the list of collaborators from the runtime\n",
+ " 2. Initializing the current round counter\n",
+ " 3. Starting the first step of the workflow by sending the model\n",
+ " to all collaborators for validation\n",
+ " \n",
+ " The @aggregator decorator ensures this method runs on the aggregator node.\n",
+ " \"\"\"\n",
+ " print(f\"Performing initialization for model\")\n",
+ " self.collaborators = self.runtime.collaborators\n",
+ " self.current_round = 0\n",
+ " # Start the workflow by sending the model to all collaborators\n",
+ " self.next(\n",
+ " self.aggregated_model_validation,\n",
+ " foreach=\"collaborators\",\n",
+ " )\n",
+ "\n",
+ " \n",
+ " @collaborator\n",
+ " def aggregated_model_validation(self):\n",
+ " \"\"\"\n",
+ " Validate the aggregated model on each collaborator's local dataset.\n",
+ " \n",
+ " This method:\n",
+ " 1. Loads the model with appropriate quantization configuration\n",
+ " 2. Applies the PEFT configuration and parameters\n",
+ " 3. Creates a trainer with local validation dataset\n",
+ " 4. Evaluates the model and records the validation loss\n",
+ " 5. Transitions to the training phase\n",
+ " \n",
+ " The @collaborator decorator ensures this method runs on each collaborator node.\n",
+ " \n",
+ " Notes:\n",
+ " Includes fallback to CPU if GPU memory is insufficient\n",
+ " \"\"\"\n",
+ " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n",
+ " # Load model with quantization and CPU offloading if needed\n",
+ " device_map = \"auto\" \n",
+ " try:\n",
+ " # Try to load model on GPU with quantization\n",
+ " self.model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " quantization_config=bnb_config,\n",
+ " device_map=device_map,\n",
+ " #max_memory={0: \"4GiB\", \"cpu\": \"24GiB\"},\n",
+ " trust_remote_code=True\n",
+ " )\n",
+ " except ValueError:\n",
+ " # Fallback to CPU if GPU memory is insufficient\n",
+ " print(f\"Falling back to CPU mode for {self.input}\")\n",
+ " self.model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " device_map=\"cpu\",\n",
+ " trust_remote_code=True\n",
+ " )\n",
+ " \n",
+ " # Prepare model for training with quantization\n",
+ " self.model = prepare_model_for_kbit_training(self.model)\n",
+ " # Apply PEFT configuration (LoRA)\n",
+ " self.model = get_peft_model(self.model, peft_config)\n",
+ " # Load aggregated parameters\n",
+ " set_peft_model_state_dict(self.model, self.peft_params)\n",
+ " \n",
+ " # Setup trainer for evaluation\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_args,\n",
+ " peft_config=peft_config,\n",
+ " train_dataset=self.train_dataset,\n",
+ " eval_dataset=self.eval_dataset,\n",
+ " max_seq_length=1024,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ "\n",
+ " # Remove default printer callback to avoid verbose output\n",
+ " trainer.remove_callback(PrinterCallback)\n",
+ " # Evaluate model and store metrics\n",
+ " out = trainer.evaluate()\n",
+ " self.agg_validation_score = out[\"eval_loss\"]\n",
+ " print(f\"{self.input} value of {self.agg_validation_score}\")\n",
+ " # Move to training phase\n",
+ " self.next(self.train)\n",
+ "\n",
+ " @collaborator\n",
+ " def train(self):\n",
+ " \"\"\"\n",
+ " Train the model on each collaborator's local dataset.\n",
+ " \n",
+ " This method:\n",
+ " 1. Creates an SFTTrainer with the local training dataset\n",
+ " 2. Runs the training process\n",
+ " 3. Records the training loss\n",
+ " 4. Saves the trained model\n",
+ " 5. Transitions to local validation phase\n",
+ " \n",
+ " The @collaborator decorator ensures this method runs on each collaborator node.\n",
+ " \"\"\"\n",
+ " # Setup trainer for local training\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_args,\n",
+ " peft_config=peft_config,\n",
+ " train_dataset=self.train_dataset,\n",
+ " eval_dataset=self.eval_dataset,\n",
+ " max_seq_length=1024,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ "\n",
+ " # Execute training\n",
+ " out = trainer.train()\n",
+ " # Store training loss for later analysis\n",
+ " self.loss = out.training_loss\n",
+ " # Save locally trained model\n",
+ " trainer.save_model()\n",
+ " self.training_completed = True\n",
+ " # Move to local validation phase\n",
+ " self.next(self.local_model_validation)\n",
+ "\n",
+ " @collaborator\n",
+ " def local_model_validation(self):\n",
+ " \"\"\"\n",
+ " Validate the locally trained model on each collaborator's validation dataset.\n",
+ " \n",
+ " This method:\n",
+ " 1. Creates an SFTTrainer with the local validation dataset\n",
+ " 2. Evaluates the locally trained model\n",
+ " 3. Records the validation loss\n",
+ " 4. Extracts the PEFT parameters for aggregation\n",
+ " 5. Sends results to the aggregator for parameter aggregation\n",
+ " \n",
+ " The @collaborator decorator ensures this method runs on each collaborator node.\n",
+ " \n",
+ " Notes:\n",
+ " Excludes the full model and training flags from the data sent to the aggregator\n",
+ " to reduce communication overhead\n",
+ " \"\"\"\n",
+ " # Setup trainer for evaluation\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_args,\n",
+ " peft_config=peft_config,\n",
+ " train_dataset=self.train_dataset,\n",
+ " eval_dataset=self.eval_dataset,\n",
+ " max_seq_length=1024,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ " # Evaluate the locally trained model\n",
+ " out = trainer.evaluate()\n",
+ " self.local_validation_score = out[\"eval_loss\"]\n",
+ " # Extract PEFT parameters for aggregation\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " print(f\"Doing local model validation for collaborator {self.input}\")\n",
+ " # Send results to aggregator, excluding the full model and training flags\n",
+ " # to reduce communication overhead\n",
+ " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n",
+ "\n",
+ " @aggregator\n",
+ " def join(self, inputs):\n",
+ " \"\"\"\n",
+ " Aggregate results from all collaborators and update the global model.\n",
+ " \n",
+ " This method:\n",
+ " 1. Calculates average loss, aggregated model accuracy, and local model accuracy\n",
+ " 2. Updates the global model using Federated Averaging (FedAvg)\n",
+ " 3. Saves the aggregated model and tokenizer\n",
+ " 4. Either starts the next round or ends the workflow depending on round count\n",
+ " \n",
+ " Args:\n",
+ " inputs: List of data objects from all collaborators containing validation scores\n",
+ " and PEFT parameters.\n",
+ " \n",
+ " The @aggregator decorator ensures this method runs on the aggregator node.\n",
+ " \"\"\"\n",
+ " # Calculate average metrics across all collaborators\n",
+ " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n",
+ " self.aggregated_model_accuracy = sum(\n",
+ " input.agg_validation_score for input in inputs\n",
+ " ) / len(inputs)\n",
+ " self.local_model_accuracy = sum(\n",
+ " input.local_validation_score for input in inputs\n",
+ " ) / len(inputs)\n",
+ " \n",
+ " # Display aggregated metrics\n",
+ " print(\n",
+ " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n",
+ " )\n",
+ " print(f\"Average training loss = {self.average_loss}\")\n",
+ " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n",
+ "\n",
+ " # Perform federated averaging of model parameters\n",
+ " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ "\n",
+ " # Save the aggregated model for future use\n",
+ " self.model.save_pretrained(\"./aggregated/model\")\n",
+ " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n",
+ " \n",
+ " # Increment round counter and start next round or end workflow\n",
+ " self.current_round += 1\n",
+ " if self.current_round < self.rounds:\n",
+ " self.next(\n",
+ " self.aggregated_model_validation,\n",
+ " foreach=\"collaborators\",\n",
+ " exclude=[\"model\"],\n",
+ " )\n",
+ " else:\n",
+ " self.next(self.end)\n",
+ "\n",
+ " @aggregator\n",
+ " def end(self):\n",
+ " \"\"\"\n",
+ " End the federated learning process.\n",
+ " \n",
+ " This method marks the end of the federated learning workflow after all rounds\n",
+ " have been completed. The final aggregated model and tokenizer are already saved\n",
+ " in the last join step.\n",
+ " \n",
+ " The @aggregator decorator ensures this method runs on the aggregator node.\n",
+ " \"\"\"\n",
+ " print(f\"This is the end of the flow\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7bc8fe27",
+ "metadata": {},
+ "source": [
+ "## Run Federated Learning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling start\n",
+ "\u001b[94mPerforming initialization for model\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Generating train split: 913 examples [00:01, 623.08 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "Generating train split: 104 examples [00:00, 583.62 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland value of 0.5918120741844177\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Generating train split: 913 examples [00:01, 616.37 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "Generating train split: 104 examples [00:00, 615.85 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "\u001b[0m`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.32it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Generating train split: 907 examples [00:01, 626.09 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "Generating train split: 105 examples [00:00, 634.41 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [14/14 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle value of 0.589488685131073\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [14/14 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94mAverage aggregated model validation values = 0.5906503796577454\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage training loss = 0.3295206361469617\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage local model validation values = 0.3146952837705612\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.33it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland value of 0.31504756212234497\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [14/14 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle value of 0.31057578325271606\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Setup federated learning participants\n",
+ "aggregator = Aggregator() # Central coordinator that aggregates model updates\n",
+ "collaborators = [\n",
+ " Collaborator(name=\"Portland\"), # First participant with local dataset\n",
+ " Collaborator(name=\"Seattle\") # Second participant with local dataset\n",
+ "]\n",
+ "\n",
+ "# Distribute data shards to collaborators (simulating data silos)\n",
+ "# Each collaborator gets a non-overlapping portion of the dataset\n",
+ "for idx, colab in enumerate(collaborators):\n",
+ " colab.private_attributes = {\n",
+ " \"train_dataset\": train_dataset.shard(len(collaborators), idx), # Training shard\n",
+ " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx) # Evaluation shard\n",
+ " }\n",
+ "\n",
+ "# Set up and execute the federated learning workflow\n",
+ "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators) # Local simulation runtime\n",
+ "flflow = FederatedFlow(model, rounds=2) # Create flow with 2 federated learning rounds\n",
+ "flflow.runtime = runtime # Assign runtime to the flow\n",
+ "flflow.run() # Start the federated learning process"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python (myenv)",
+ "language": "python",
+ "name": "myenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
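[Editor's note: a minimal, self-contained sketch of how datasets.Dataset.shard yields the disjoint, near-equal slices that the comments in the cell above promise. The toy dataset and the explicit contiguous=True are assumptions for illustration; only shard itself comes from the notebook.]

from datasets import Dataset

ds = Dataset.from_dict({"x": list(range(10))})  # stand-in for train_dataset

# Two collaborators -> two shards; contiguous=True makes the split explicit.
portland = ds.shard(num_shards=2, index=0, contiguous=True)
seattle = ds.shard(num_shards=2, index=1, contiguous=True)

print(portland["x"])  # [0, 1, 2, 3, 4]
print(seattle["x"])   # [5, 6, 7, 8, 9]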
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb
deleted file mode 100644
index 0c6884f384..0000000000
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb
+++ /dev/null
@@ -1,705 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "a59f475d-d843-46bc-b75e-10984b687ed3",
- "metadata": {},
- "source": [
- "# Federated Fine-Tuning of Phi-4 Using OpenFL"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
- "metadata": {},
- "source": [
- "\n",
- "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow.\n",
- "\n",
- "We will fine-tune **Microsoft's [Phi4](https://huggingface.co/microsoft/phi-4)** model using a diverse dataset such as [Math_10k](https://github.com/AGI-Edgerunners/LLM-Adapters/tree/main), an open-source dataset containing mathematical question-answer pairs collected from various smaller math datasets."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
- "metadata": {},
- "source": [
- "## The Workflow Interface"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e3d74610-e48d-4dd4-b622-eb910fbe91aa",
- "metadata": {},
- "source": [
- "The workflow interface is an innovative approach to designing federated learning experiments with OpenFL. It was developed in response to discussions with researchers and users who had unique use cases that didn’t perfectly align with the traditional horizontal federated learning model. This interface enables more flexible compositions of experiments, allowing for greater customization and adaptability in complex, real-world scenarios"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "413e1d95-fd76-4fe0-b8d0-4c625c2a8fd3",
- "metadata": {},
- "source": [
- "## Installing OpenFL\n",
- "To install OpenFL, follow the official documentation: \n",
- "[OpenFL Installation Guide](https://openfl.readthedocs.io/en/latest/installation.html)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "53654c70",
- "metadata": {},
- "source": [
- "After installation, activate experimental APIs using: \n",
- "`fx experimental activate`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Install dependencies \n",
- "!pip install torch transformers peft datasets trl==0.12.2 -q"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
- "metadata": {},
- "source": [
- "## Import libraries"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "be4690ae-0671-4d3a-8f21-620ab865a03e",
- "metadata": {},
- "outputs": [],
- "source": [
- "import hashlib\n",
- "import os\n",
- "\n",
- "import numpy as np\n",
- "import requests\n",
- "import torch\n",
- "import transformers\n",
- "from datasets import load_dataset\n",
- "from peft import LoraConfig, get_peft_model\n",
- "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict\n",
- "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments\n",
- "from transformers.trainer_callback import PrinterCallback\n",
- "from trl import SFTTrainer\n",
- "\n",
- "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n",
- "from openfl.experimental.workflow.placement import aggregator, collaborator\n",
- "from openfl.experimental.workflow.runtime import LocalRuntime"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
- "metadata": {},
- "source": [
- "## Acquiring and preprocessing dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7ba1d8b6-8a5b-41a2-8c77-c9a85e869cda",
- "metadata": {},
- "source": [
- "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d615d626-8727-4169-b2a6-3ba15c3cdb95",
- "metadata": {},
- "outputs": [],
- "source": [
- "def file_checksum(file_path, algorithm=\"sha256\"):\n",
- " \"\"\"\n",
- " Calculate the checksum of a file using the specified hashing algorithm.\n",
- "\n",
- " Parameters:\n",
- " file_path (str): The path to the file for which the checksum is to be calculated.\n",
- " algorithm (str): The hashing algorithm to use (default is 'sha256').\n",
- "\n",
- " Returns:\n",
- " str: The calculated checksum of the file.\n",
- " \"\"\"\n",
- " hash_func = hashlib.new(algorithm)\n",
- " with open(file_path, \"rb\") as f:\n",
- " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
- " hash_func.update(chunk)\n",
- " return hash_func.hexdigest()\n",
- "\n",
- "\n",
- "if not os.path.exists(\"math_10k.json\"):\n",
- " r = requests.get(\n",
- " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n",
- " )\n",
- " with open(\n",
- " \"math_10k.json\",\n",
- " \"wb\",\n",
- " ) as f:\n",
- " f.write(r.content)\n",
- "\n",
- " actual_checksum = file_checksum(\"math_10k.json\")\n",
- " if (\n",
- " actual_checksum\n",
- " != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n",
- " ):\n",
- " raise ValueError(\n",
- " \"Checksum verification failed. The file may have been altered.\"\n",
- " )\n",
- "\n",
- "raw_dataset = load_dataset(\"json\", data_files=\"math_10k.json\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3ab15ad6-db35-4a58-a2d5-54a6d3ccdc78",
- "metadata": {},
- "source": [
- "## Initialize arguments and configurations"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "eada9809-468a-47c6-9b03-55aa887c9487",
- "metadata": {},
- "outputs": [],
- "source": [
- "training_config = {\n",
- " \"bf16\": True,\n",
- " \"use_cpu\": True,\n",
- " \"do_eval\": False,\n",
- " \"learning_rate\": 5.0e-06,\n",
- " \"log_level\": \"info\",\n",
- " \"logging_steps\": 20,\n",
- " \"lr_scheduler_type\": \"cosine\",\n",
- " \"num_train_epochs\": 1,\n",
- " \"output_dir\": \"./checkpoint_dir\",\n",
- " \"overwrite_output_dir\": True,\n",
- " \"per_device_eval_batch_size\": 1,\n",
- " \"per_device_train_batch_size\": 1,\n",
- " \"save_steps\": 100,\n",
- " \"save_total_limit\": 1,\n",
- " \"seed\": 0,\n",
- " \"gradient_checkpointing\": True,\n",
- " \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n",
- " \"warmup_ratio\": 0.2,\n",
- "}\n",
- "\n",
- "peft_config = {\n",
- " \"r\": 1,\n",
- " \"lora_alpha\": 2,\n",
- " \"lora_dropout\": 0.05,\n",
- " \"bias\": \"none\",\n",
- " \"task_type\": \"CAUSAL_LM\",\n",
- " \"target_modules\": \"all-linear\",\n",
- " \"modules_to_save\": None,\n",
- "}\n",
- "model_kwargs = dict(\n",
- " use_cache=False,\n",
- " trust_remote_code=True,\n",
- " torch_dtype=torch.bfloat16,\n",
- " device_map=None,\n",
- ")\n",
- "train_conf = TrainingArguments(**training_config)\n",
- "peft_conf = LoraConfig(**peft_config)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
- "metadata": {},
- "source": [
- "## Load and initialize model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
- "metadata": {},
- "outputs": [],
- "source": [
- "checkpoint_path = \"NyxKrage/Microsoft_Phi-4\"\n",
- "model = AutoModelForCausalLM.from_pretrained(\n",
- " checkpoint_path, return_dict=True, **model_kwargs\n",
- ")\n",
- "model = get_peft_model(model, peft_conf)\n",
- "\n",
- "tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)\n",
- "sequence_max_length = 512\n",
- "val_set_size = 2000\n",
- "tokenizer.pad_token_id = 0 # we want this to be different from the eos token\n",
- "tokenizer.padding_side = \"left\" # Allow batched inference"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
- "metadata": {},
- "source": [
- "## Preprocess dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
- "metadata": {},
- "outputs": [],
- "source": [
- "def generate_prompt(data_point):\n",
- " \"\"\"\n",
- " Generate a prompt based on the given data point.\n",
- "\n",
- " Parameters:\n",
- " data_point (dict): A dictionary containing the instruction, input, and output.\n",
- "\n",
- " Returns:\n",
- " str: The generated prompt as a string.\n",
- " \"\"\"\n",
- " if data_point[\"input\"]:\n",
- " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. \n",
- "\n",
- " ### Instruction:\n",
- " {data_point[\"instruction\"]}\n",
- " \n",
- " ### Input:\n",
- " {data_point[\"input\"]}\n",
- " \n",
- " ### Response:\n",
- " {data_point[\"output\"]}\"\"\"\n",
- " else:\n",
- " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request. \n",
- "\n",
- " ### Instruction:\n",
- " {data_point[\"instruction\"]}\n",
- " \n",
- " ### Response:\n",
- " {data_point[\"output\"]}\"\"\"\n",
- "\n",
- "\n",
- "def tokenize(prompt, add_eos_token=True):\n",
- " \"\"\"\n",
- " Tokenize the given prompt.\n",
- "\n",
- " Parameters:\n",
- " prompt (str): The prompt to be tokenized.\n",
- " add_eos_token (bool): Whether to add an end-of-sequence token (default is True).\n",
- "\n",
- " Returns:\n",
- " dict: A dictionary containing the tokenized input IDs and attention mask.\n",
- " \"\"\"\n",
- " result = tokenizer(\n",
- " prompt,\n",
- " truncation=True,\n",
- " max_length=sequence_max_length,\n",
- " padding=False,\n",
- " return_tensors=None,\n",
- " )\n",
- " if (\n",
- " result[\"input_ids\"][-1] != tokenizer.eos_token_id\n",
- " and len(result[\"input_ids\"]) < sequence_max_length\n",
- " and add_eos_token\n",
- " ):\n",
- " result[\"input_ids\"].append(tokenizer.eos_token_id)\n",
- " result[\"attention_mask\"].append(1)\n",
- "\n",
- " result[\"labels\"] = result[\"input_ids\"].copy()\n",
- "\n",
- " return result\n",
- "\n",
- "\n",
- "def generate_and_tokenize_prompt(data_point):\n",
- " \"\"\"\n",
- " Generate and tokenize a prompt based on the given data point.\n",
- "\n",
- " Parameters:\n",
- " data_point (dict): A dictionary containing the instruction, input, and output.\n",
- "\n",
- " Returns:\n",
- " dict: A dictionary containing the tokenized input IDs, attention mask, and labels.\n",
- " \"\"\"\n",
- " full_prompt = generate_prompt(data_point)\n",
- " tokenized_full_prompt = tokenize(full_prompt)\n",
- " user_prompt = generate_prompt({**data_point, \"output\": \"\"})\n",
- " tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)\n",
- " user_prompt_len = len(tokenized_user_prompt[\"input_ids\"])\n",
- "\n",
- " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\n",
- " \"labels\"\n",
- " ][user_prompt_len:]\n",
- " return tokenized_full_prompt\n",
- "\n",
- "\n",
- "train_val = raw_dataset[\"train\"].train_test_split(\n",
- " test_size=val_set_size, shuffle=True, seed=42\n",
- ")\n",
- "\n",
- "processed_train_dataset = train_val[\"train\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))\n",
- "processed_test_dataset = train_val[\"test\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
- "metadata": {},
- "source": [
- "## Define Federated Averaging Method\n",
- "The FedAvg method is used to average the models from all the collaborators after training."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
- "metadata": {},
- "outputs": [],
- "source": [
- "def FedAvg(peft_params, model, weights=None):\n",
- " \"\"\"\n",
- " Perform Federated Averaging (FedAvg) on the model parameters.\n",
- "\n",
- " Parameters:\n",
- " peft_params (list): A list of state dictionaries containing the model parameters from different clients.\n",
- " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n",
- " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n",
- "\n",
- " Returns:\n",
- " torch.nn.Module: The model with the averaged parameters applied.\n",
- " \"\"\"\n",
- " state_dicts = peft_params\n",
- " state_dict = get_peft_model_state_dict(model)\n",
- " for key in peft_params[0]:\n",
- " dtype = state_dicts[0][key].dtype\n",
- " state_dict[key] = torch.from_numpy(\n",
- " np.average(\n",
- " [state[key].to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n",
- " )\n",
- " ).to(dtype)\n",
- " set_peft_model_state_dict(model, state_dict)\n",
- " return model"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "810eb75e",
- "metadata": {},
- "source": [
- "Now we come to the flow definition. The OpenFL Workflow Interface adopts the conventions set by Metaflow, that every workflow begins with `start` and concludes with the `end` task. The aggregator begins with an optionally passed in model and optimizer. The aggregator begins the flow with the `start` task, where the list of collaborators is extracted from the runtime (`self.collaborators = self.runtime.collaborators`) and is then used as the list of participants to run the task listed in `self.next`, `aggregated_model_validation`. The model, optimizer, and anything that is not explicitly excluded from the next function will be passed from the `start` function on the aggregator to the `aggregated_model_validation` task on the collaborator. Where the tasks run is determined by the placement decorator that precedes each task definition (`@aggregator` or `@collaborator`). Once each of the collaborators (defined in the runtime) complete the `aggregated_model_validation` task, they pass their current state onto the `train` task, from `train` to `local_model_validation`, and then finally to `join` at the aggregator. It is in `join` that an average is taken of the model weights, and the next round can begin.\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "58298e8e-ab9e-4377-966e-143823441697",
- "metadata": {},
- "outputs": [],
- "source": [
- "class FederatedFlow(FLSpec):\n",
- " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n",
- " \"\"\"\n",
- " Initialize the class with the given model, optimizer, and number of rounds.\n",
- "\n",
- " Parameters:\n",
- " model (torch.nn.Module, optional): The model to be used. If None, a ValueError is raised.\n",
- " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n",
- " rounds (int, optional): The number of rounds for training or processing (default is 3).\n",
- " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n",
- "\n",
- " Raises:\n",
- " ValueError: If no model is provided.\n",
- " \"\"\"\n",
- " super().__init__(**kwargs)\n",
- " if model is not None:\n",
- " self.model = model\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " self.optimizer = optimizer\n",
- " else:\n",
- " raise ValueError(\"No model inputted\")\n",
- "\n",
- " self.rounds = rounds\n",
- " \n",
- "\n",
- " @aggregator\n",
- " def start(self):\n",
- " \"\"\"\n",
- " Initialize the model and set up the collaborators for federated learning.\n",
- "\n",
- " This method performs the initial setup for the model, including setting the\n",
- " collaborators, initializing private variables, and starting the first round\n",
- " of the federated learning process.\n",
- " \"\"\"\n",
- " print(f\"Performing initialization for model\")\n",
- " self.collaborators = self.runtime.collaborators\n",
- " self.current_round = 0\n",
- " self.next(\n",
- " self.aggregated_model_validation,\n",
- " foreach=\"collaborators\",\n",
- " )\n",
- "\n",
- " \n",
- " @collaborator\n",
- " def aggregated_model_validation(self):\n",
- " \"\"\"\n",
- " Perform aggregated model validation for a collaborator.\n",
- "\n",
- " This method loads the model, applies the PEFT configuration, and evaluates\n",
- " the model using the provided training and evaluation datasets. The validation\n",
- " score is then stored and the next step in the process is triggered.\n",
- " \"\"\"\n",
- " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n",
- " self.model = AutoModelForCausalLM.from_pretrained(\n",
- " checkpoint_path, return_dict=True, **model_kwargs\n",
- " )\n",
- " self.model = get_peft_model(self.model, peft_conf)\n",
- " set_peft_model_state_dict(self.model, self.peft_params)\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=train_conf,\n",
- " peft_config=peft_conf,\n",
- " train_dataset=self.train_dataset,\n",
- " eval_dataset=self.eval_dataset,\n",
- " max_seq_length=sequence_max_length,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- "\n",
- " trainer.remove_callback(PrinterCallback)\n",
- " out = trainer.evaluate()\n",
- " self.agg_validation_score = out[\"eval_loss\"]\n",
- " print(f\"{self.input} value of {self.agg_validation_score}\")\n",
- " self.next(self.train)\n",
- "\n",
- " @collaborator\n",
- " def train(self):\n",
- " \"\"\"\n",
- " Train the model for a collaborator.\n",
- "\n",
- " This method trains the model using the provided training and evaluation datasets.\n",
- " The training loss is stored, the model is saved, and the next step in the process\n",
- " is triggered.\n",
- " \"\"\"\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=train_conf,\n",
- " peft_config=peft_conf,\n",
- " train_dataset=self.train_dataset,\n",
- " eval_dataset=self.eval_dataset,\n",
- " max_seq_length=sequence_max_length,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- "\n",
- " out = trainer.train()\n",
- " self.loss = out.training_loss\n",
- " trainer.save_model()\n",
- " self.training_completed = True\n",
- " self.next(self.local_model_validation)\n",
- "\n",
- " @collaborator\n",
- " def local_model_validation(self):\n",
- " \"\"\"\n",
- " Perform local model validation for a collaborator.\n",
- "\n",
- " This method evaluates the model using the provided training and evaluation datasets.\n",
- " The validation score is stored, the PEFT parameters are updated, and the next step\n",
- " in the process is triggered.\n",
- " \"\"\"\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=train_conf,\n",
- " peft_config=peft_conf,\n",
- " train_dataset=processed_train_dataset,\n",
- " eval_dataset=processed_test_dataset,\n",
- " max_seq_length=sequence_max_length,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- " out = trainer.evaluate()\n",
- " self.local_validation_score = out[\"eval_loss\"]\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " print(f\"Doing local model validation for collaborator {self.input}\")\n",
- " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n",
- "\n",
- " @aggregator\n",
- " def join(self, inputs):\n",
- " \"\"\"\n",
- " Aggregate the results from all collaborators and update the model.\n",
- "\n",
- " This method calculates the average loss, aggregated model accuracy, and local model\n",
- " accuracy from all collaborators. The model parameters are updated using Federated\n",
- " Averaging (FedAvg), and the next round of the process is triggered if applicable.\n",
- " \"\"\"\n",
- " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n",
- " self.aggregated_model_accuracy = sum(\n",
- " input.agg_validation_score for input in inputs\n",
- " ) / len(inputs)\n",
- " self.local_model_accuracy = sum(\n",
- " input.local_validation_score for input in inputs\n",
- " ) / len(inputs)\n",
- " print(\n",
- " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n",
- " )\n",
- " print(f\"Average training loss = {self.average_loss}\")\n",
- " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n",
- "\n",
- " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- "\n",
- " self.model.save_pretrained(\"./aggregated/model\")\n",
- " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n",
- " self.current_round += 1\n",
- " if self.current_round < self.rounds:\n",
- " self.next(\n",
- " self.aggregated_model_validation,\n",
- " foreach=\"collaborators\",\n",
- " exclude=[\"model\"],\n",
- " )\n",
- " else:\n",
- " self.next(self.end)\n",
- "\n",
- " @aggregator\n",
- " def end(self):\n",
- " \"\"\"\n",
- " End the federated learning process.\n",
- "\n",
- " This method marks the end of the federated learning process and performs any\n",
- " necessary cleanup or finalization steps.\n",
- " \"\"\"\n",
- " print(f\"This is the end of the flow\")\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba",
- "metadata": {},
- "source": [
- "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_dataset` and `eval_dataset` for each of the collaborators. These are **private_attributes** that are exposed only through the runtime. Each participant has its own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task.\n",
- "\n",
- "Below, we segment shards of the Math_10k dataset for **two collaborators**: Portland and Seattle. Each has their own slice of the dataset that's accessible via the `train_dataset` or `eval_dataset` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transferring from collaborator to aggregator, or vice versa."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Setup participants\n",
- "_aggregator = Aggregator()\n",
- "_aggregator.private_attributes = {}\n",
- "\n",
- "# Setup collaborators with private attributes\n",
- "collaborator_names = [\n",
- " \"Portland\",\n",
- " \"Seattle\",\n",
- "]\n",
- "_collaborators = [Collaborator(name=name) for name in collaborator_names]\n",
- "\n",
- "for idx, current_collaborator in enumerate(_collaborators):\n",
- " # Set the private attributes of the Collaborator to include their specific training and testing data loaders\n",
- " current_collaborator.private_attributes = {\n",
- " \"train_dataset\": processed_train_dataset.shard(\n",
- " num_shards=len(_collaborators), index=idx\n",
- " ),\n",
- " \"eval_dataset\": processed_test_dataset.shard(\n",
- " num_shards=len(_collaborators), index=idx\n",
- " ),\n",
- " }\n",
- "\n",
- "local_runtime = LocalRuntime(\n",
- " aggregator=_aggregator, collaborators=_collaborators, backend=\"single_process\"\n",
- ")\n",
- "print(f\"Local runtime collaborators = {local_runtime.collaborators}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9cb61fc0",
- "metadata": {},
- "source": [
- "## Run Experiment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "flflow = FederatedFlow(model, rounds=2)\n",
- "flflow.runtime = local_runtime\n",
- "flflow.run()\n",
- "\n",
- "# Determine the final model accuracy:\n",
- "print(f'\\nFinal aggregated model accuracy for {flflow.rounds} rounds of training: {flflow.aggregated_model_accuracy}')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7bc8fe27",
- "metadata": {},
- "source": [
- "## 🎉 Congratulations! 🎉\n",
- "\n",
- "Now that you've completed this notebook, check out our [other tutorials](https://github.com/securefederatedai/openfl/tree/develop/openfl-tutorials/experimental/)\n",
- "\n",
- "- Using the LocalRuntime Ray Backend for dedicated GPU access\n",
- "- Vertical Federated Learning\n",
- "- Model Watermarking\n",
- "- Differential Privacy\n",
- "- And More!"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
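[Editor's note: a minimal sketch of the weighted averaging the deleted FedAvg helper performs, reduced to a single LoRA tensor. The toy values and weights are assumptions for illustration; the np.average call mirrors the one in the notebook.]

import numpy as np
import torch

# Two collaborators' versions of one adapter parameter.
client_params = [
    {"lora_A": torch.tensor([1.0, 2.0])},
    {"lora_A": torch.tensor([3.0, 4.0])},
]
weights = [0.25, 0.75]  # e.g. proportional to local dataset sizes; None -> equal

averaged = torch.from_numpy(
    np.average(
        [p["lora_A"].to(torch.float).numpy() for p in client_params],
        axis=0,
        weights=weights,
    )
)
print(averaged)  # tensor([2.5000, 3.5000], dtype=torch.float64)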
From 1a9c1f426e5575e54fda983b9bdc530d5408aafe Mon Sep 17 00:00:00 2001
From: Rajith
Date: Fri, 16 May 2025 11:07:39 +0530
Subject: [PATCH 27/34] removing unwanted files
---
.github/scripts/extract_emails.py | 45 --
.github/scripts/send_email.py | 97 ----
.../workflow/LLM/phi-4-quanti.ipynb | 300 ------------
.../experimental/workflow/LLM/phi-4-sol.ipynb | 438 ------------------
4 files changed, 880 deletions(-)
delete mode 100644 .github/scripts/extract_emails.py
delete mode 100644 .github/scripts/send_email.py
delete mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
delete mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb
diff --git a/.github/scripts/extract_emails.py b/.github/scripts/extract_emails.py
deleted file mode 100644
index 70f17b64a6..0000000000
--- a/.github/scripts/extract_emails.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import re
-import os
-import sys
-import json
-
-def extract_emails(filepath):
- """
- Extract all unique email addresses from the given file.
- """
- email_pattern = r'[\w.+-]+@[\w-]+\.[\w.-]+'
- unique_emails = set()
-
- try:
- with open(filepath, 'r') as file:
- for line in file:
- # Skip comment lines that don't contain emails
- if line.strip().startswith('#') and '@' not in line:
- continue
-
- # Find all email addresses in the line
- emails = re.findall(email_pattern, line)
- unique_emails.update(emails)
- except Exception as e:
- print(f"Error processing {filepath}: {str(e)}", file=sys.stderr)
-
- return sorted(unique_emails)
-
-if __name__ == "__main__":
- # Check CODEOWNERS in standard locations
- codeowners_path = None
- for path in ['.github/CODEOWNERS', 'CODEOWNERS', 'docs/CODEOWNERS']:
- if os.path.exists(path):
- codeowners_path = path
- break
-
- result = {
- "emails": [],
- "codeowners_path": codeowners_path
- }
-
- if codeowners_path:
- emails = extract_emails(codeowners_path)
- result["emails"] = emails
-
- print(json.dumps(result))
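[Editor's note: a quick, self-contained check of the email regex used by the deleted script above; the sample CODEOWNERS-style line and its addresses are hypothetical.]

import re

email_pattern = r'[\w.+-]+@[\w-]+\.[\w.-]+'
line = "/scripts/ alice@example.com bob.smith@sub.example.org"
print(re.findall(email_pattern, line))
# ['alice@example.com', 'bob.smith@sub.example.org']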
diff --git a/.github/scripts/send_email.py b/.github/scripts/send_email.py
deleted file mode 100644
index 4c151e30fc..0000000000
--- a/.github/scripts/send_email.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import os
-import smtplib
-import logging
-import argparse
-from email.message import EmailMessage
-from email.mime.base import MIMEBase
-from email import encoders
-
-logger = logging.getLogger(__name__)
-
-def send_email(sender_email: str, to_email: str, subject: str, email_body: str, smtp_user: str, smtp_pwd: str,
- smtp_email_server: str, cc_email: str = '', bcc_email: str = '', reply_email: str = '', is_html_body: bool = False,
- attachments: str = '') -> None:
-
- message = EmailMessage()
- message["Subject"] = subject
- message["From"] = sender_email
- if to_email:
- to_list = to_email.split(",")
- message["To"] = ", ".join(to_list)
- if cc_email:
- cc_list = cc_email.split(",")
- message["Cc"] = ", ".join(cc_list)
- if reply_email:
- message["Reply-To"] = reply_email
- sub_type = 'plain'
- if is_html_body:
- sub_type = 'html'
- message.set_content(email_body, subtype=sub_type)
- # Set up attachment if any
- if attachments:
- for attachment in attachments.split(','):
- with open(attachment, 'rb') as attachment_file:
- attachment_data = attachment_file.read()
- message.add_attachment(
- attachment_data,
- maintype='application',
- subtype='octet-stream',
- filename=os.path.basename(attachment)
- )
- logger.info(f'Setting smtp server {smtp_email_server}...')
- smtp_server = smtplib.SMTP(smtp_email_server)
- smtp_server.starttls()
- smtp_server.login(smtp_user, smtp_pwd)
- logger.info(f'smtp server authentication successful')
- try:
- logger.info(f'Sending email...')
- if bcc_email:
- # Send bcc list as an argument instead of adding it to the header to keep it hidden
- bcc_list = bcc_email.split(",")
- smtp_server.send_message(message, bcc=bcc_list)
- else:
- smtp_server.send_message(message)
- logger.info(f'email sent.')
- except Exception as ex:
- raise ex
- finally:
- try:
- smtp_server.quit()
- except smtplib.SMTPServerDisconnected:
- pass
- finally:
- logger.info("smtp connection is closed")
-
-def main():
- parser = argparse.ArgumentParser(description="Send an email with optional attachments")
- parser.add_argument('--sender', required=True, help='Sender email address')
- parser.add_argument('--to', required=True, help='Recipient email address(es) (comma-separated)')
- parser.add_argument('--subject', required=True, help='Email subject')
- parser.add_argument('--body', required=True, help='Email body')
- parser.add_argument('--smtp-user', required=True, help='SMTP server username')
- parser.add_argument('--smtp-pwd', required=True, help='SMTP server password')
- parser.add_argument('--smtp-server', required=True, help='SMTP server address and port')
- parser.add_argument('--cc', default='', help='CC email address(es) (comma-separated)')
- parser.add_argument('--bcc', default='', help='BCC email address(es) (comma-separated)')
- parser.add_argument('--reply-to', default='', help='Reply-To email address')
- parser.add_argument('--html-body', action='store_true', help='Flag to indicate if email body is HTML')
- parser.add_argument('--attachments', default='', help='Attachment file path(s) (space-separated)')
- args = parser.parse_args()
-
- send_email(
- sender_email=args.sender,
- to_email=args.to,
- subject=args.subject,
- email_body=args.body,
- smtp_user=args.smtp_user,
- smtp_pwd=args.smtp_pwd,
- smtp_email_server=args.smtp_server,
- cc_email=args.cc,
- bcc_email=args.bcc,
- reply_email=args.reply_to,
- is_html_body=args.html_body,
- attachments=args.attachments
- )
-
-if __name__ == '__main__':
- main()
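[Editor's note: an illustrative call of the deleted helper's send_email function, matching its signature above. Every address, credential, and server value below is a placeholder, not a value from the repository.]

# Assumes send_email from the deleted .github/scripts/send_email.py is importable.
send_email(
    sender_email="bot@example.com",
    to_email="owner1@example.com,owner2@example.com",
    subject="OpenFL Security Scan",
    email_body="<p>Scan complete. Report attached.</p>",
    smtp_user="smtp-user",
    smtp_pwd="smtp-password",
    smtp_email_server="smtp.example.com:587",  # smtplib accepts "host:port"
    is_html_body=True,
    attachments="report.md",
)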
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
deleted file mode 100644
index 59ae25d98a..0000000000
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb
+++ /dev/null
@@ -1,300 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "a59f475d-d843-46bc-b75e-10984b687ed3",
- "metadata": {},
- "source": [
- "# Federated Fine-Tuning of Phi-4 with 8-bit Quantization"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
- "metadata": {},
- "source": [
- "This notebook demonstrates federated fine-tuning of Microsoft's Phi-4 model (4B parameters) with 8-bit quantization using OpenFL."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc",
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "be4690ae-0671-4d3a-8f21-620ab865a03e",
- "metadata": {},
- "outputs": [],
- "source": [
- "import torch\n",
- "import transformers\n",
- "from transformers import (\n",
- " AutoModelForCausalLM,\n",
- " AutoTokenizer,\n",
- " BitsAndBytesConfig,\n",
- " TrainingArguments\n",
- ")\n",
- "from peft import LoraConfig, get_peft_model\n",
- "from datasets import load_dataset\n",
- "from trl import SFTTrainer\n",
- "import numpy as np\n",
- "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n",
- "from openfl.experimental.workflow.placement import aggregator, collaborator\n",
- "from openfl.experimental.workflow.runtime import LocalRuntime"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d615d626-8727-4169-b2a6-3ba15c3cdb95",
- "metadata": {},
- "outputs": [],
- "source": [
- "# 8-bit quantization config for Phi-4\n",
- "quant_config = BitsAndBytesConfig(\n",
- " load_in_8bit=True,\n",
- " llm_int8_threshold=6.0,\n",
- " llm_int8_skip_modules=None,\n",
- " llm_int8_enable_fp32_cpu_offload=False,\n",
- " llm_int8_has_fp16_weight=False\n",
- ")\n",
- "\n",
- "# Model config for Phi-4\n",
- "model_kwargs = {\n",
- " \"quantization_config\": quant_config,\n",
- " \"device_map\": \"auto\",\n",
- " \"trust_remote_code\": True,\n",
- " \"torch_dtype\": torch.bfloat16\n",
- "}\n",
- "\n",
- "# PEFT config optimized for Phi-4\n",
- "peft_config = LoraConfig(\n",
- " r=16, # Higher rank for larger model\n",
- " lora_alpha=32,\n",
- " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
- " lora_dropout=0.05,\n",
- " bias=\"none\",\n",
- " task_type=\"CAUSAL_LM\"\n",
- ")\n",
- "\n",
- "# Training config adjusted for Phi-4\n",
- "training_config = TrainingArguments(\n",
- " output_dir=\"./results\",\n",
- " per_device_train_batch_size=1, # Reduced for 4B model\n",
- " per_device_eval_batch_size=1,\n",
- " gradient_accumulation_steps=8, # Increased for memory efficiency\n",
- " learning_rate=1e-5, # Lower learning rate for larger model\n",
- " logging_steps=10,\n",
- " num_train_epochs=1,\n",
- " max_grad_norm=0.3,\n",
- " warmup_ratio=0.03,\n",
- " lr_scheduler_type=\"cosine\",\n",
- " save_steps=100,\n",
- " bf16=True, # Using bfloat16 for Phi-4\n",
- " optim=\"adamw_torch\",\n",
- " report_to=\"none\"\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load Phi-4 model and tokenizer\n",
- "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-4\", **model_kwargs)\n",
- "model = get_peft_model(model, peft_config)\n",
- "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-4\")\n",
- "tokenizer.pad_token = tokenizer.eos_token\n",
- "tokenizer.padding_side = \"left\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Dataset preparation for Phi-4\n",
- "def format_instruction(sample):\n",
- " return f\"\"\"### Instruction:\n",
- "{sample['instruction']}\n",
- "\n",
- "### Input:\n",
- "{sample['input']}\n",
- "\n",
- "### Response:\n",
- "{sample['output']}\"\"\"\n",
- "\n",
- "dataset = load_dataset(\"json\", data_files=\"math_10k.json\")[\"train\"].train_test_split(test_size=0.1)\n",
- "train_data = dataset[\"train\"].shuffle().select(range(50)) # Smaller subset for Phi-4\n",
- "val_data = dataset[\"test\"].shuffle().select(range(10))\n",
- "\n",
- "train_data = train_data.map(lambda x: {\"text\": format_instruction(x)})\n",
- "val_data = val_data.map(lambda x: {\"text\": format_instruction(x)})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
- "metadata": {},
- "outputs": [],
- "source": [
- "class FederatedFlow(FLSpec):\n",
- " def __init__(self, model=None, rounds=3, **kwargs):\n",
- " super().__init__(**kwargs)\n",
- " self.model = model\n",
- " self.rounds = rounds\n",
- " self.training_metrics = []\n",
- " \n",
- " @aggregator\n",
- " def start(self):\n",
- " print(\"Starting federated training for Phi-4\")\n",
- " self.collaborators = self.runtime.collaborators\n",
- " self.current_round = 0\n",
- " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
- " \n",
- " @collaborator\n",
- " def aggregated_model_validation(self):\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=training_config,\n",
- " train_dataset=self.train_data,\n",
- " eval_dataset=self.val_data,\n",
- " dataset_text_field=\"text\",\n",
- " max_seq_length=512,\n",
- " tokenizer=tokenizer\n",
- " )\n",
- " metrics = trainer.evaluate()\n",
- " self.validation_loss = metrics[\"eval_loss\"]\n",
- " self.next(self.train)\n",
- " \n",
- " @collaborator\n",
- " def train(self):\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=training_config,\n",
- " train_dataset=self.train_data,\n",
- " eval_dataset=self.val_data,\n",
- " dataset_text_field=\"text\",\n",
- " max_seq_length=512,\n",
- " tokenizer=tokenizer\n",
- " )\n",
- " train_result = trainer.train()\n",
- " self.training_loss = train_result.training_loss\n",
- " self.training_metrics.append({\n",
- " \"round\": self.current_round,\n",
- " \"loss\": self.training_loss,\n",
- " \"collaborator\": self.input\n",
- " })\n",
- " self.next(self.local_model_validation)\n",
- " \n",
- " @collaborator\n",
- " def local_model_validation(self):\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=training_config,\n",
- " train_dataset=self.train_data,\n",
- " eval_dataset=self.val_data,\n",
- " dataset_text_field=\"text\",\n",
- " max_seq_length=512,\n",
- " tokenizer=tokenizer\n",
- " )\n",
- " metrics = trainer.evaluate()\n",
- " self.local_validation_loss = metrics[\"eval_loss\"]\n",
- " self.next(self.join, exclude=[\"model\"])\n",
- " \n",
- " @aggregator\n",
- " def join(self, inputs):\n",
- " avg_loss = sum(input.training_loss for input in inputs) / len(inputs)\n",
- " avg_val_loss = sum(input.validation_loss for input in inputs) / len(inputs)\n",
- " \n",
- " print(f\"Round {self.current_round} - Avg Training Loss: {avg_loss:.4f}\")\n",
- " print(f\"Round {self.current_round} - Avg Validation Loss: {avg_val_loss:.4f}\")\n",
- " \n",
- " # Aggregate model updates\n",
- " self.current_round += 1\n",
- " if self.current_round < self.rounds:\n",
- " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
- " else:\n",
- " self.next(self.end)\n",
- " \n",
- " @aggregator\n",
- " def end(self):\n",
- " print(\"Phi-4 Training complete!\")\n",
- " print(\"Final Training Metrics:\")\n",
- " for metric in self.training_metrics:\n",
- " print(f\"Round {metric['round']} - {metric['collaborator']} - Loss: {metric['loss']:.4f}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Setup runtime for Phi-4\n",
- "aggregator = Aggregator()\n",
- "collaborators = [\n",
- " Collaborator(name=\"Portland\", private_attributes={\"train_data\": train_data.shard(2, 0), \"val_data\": val_data.shard(2, 0)}),\n",
- " Collaborator(name=\"Seattle\", private_attributes={\"train_data\": train_data.shard(2, 1), \"val_data\": val_data.shard(2, 1)})\n",
- "]\n",
- "\n",
- "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend=\"single_process\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Run training for Phi-4\n",
- "flow = FederatedFlow(model, rounds=2)\n",
- "flow.runtime = runtime\n",
- "flow.run()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7bc8fe27",
- "metadata": {},
- "source": [
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb
deleted file mode 100644
index b1ca1bac6e..0000000000
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb
+++ /dev/null
@@ -1,438 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "a59f475d-d843-46bc-b75e-10984b687ed3",
- "metadata": {},
- "source": [
- "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
- "metadata": {},
- "source": [
- "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n",
- "- Parameter-Efficient Fine-Tuning (PEFT)\n",
- "- 4-bit Quantization (QLoRA)\n",
- "- Gradient Checkpointing\n",
- "- Optimized Training Configuration"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
- "metadata": {},
- "source": [
- "## Installation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc",
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
- "metadata": {},
- "source": [
- "## Import Libraries"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "be4690ae-0671-4d3a-8f21-620ab865a03e",
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import torch\n",
- "from transformers import (\n",
- " AutoModelForCausalLM,\n",
- " AutoTokenizer,\n",
- " BitsAndBytesConfig,\n",
- " TrainingArguments\n",
- ")\n",
- "from peft import (\n",
- " LoraConfig,\n",
- " get_peft_model,\n",
- " prepare_model_for_kbit_training,\n",
- " PeftModel\n",
- ")\n",
- "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # Added this import\n",
- "from datasets import load_dataset\n",
- "from trl import SFTTrainer\n",
- "from openfl.experimental.workflow import FLSpec, Aggregator, Collaborator, LocalRuntime\n",
- "import numpy as np"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
- "metadata": {},
- "source": [
- "## Configuration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "eada9809-468a-47c6-9b03-55aa887c9487",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Model and dataset\n",
- "model_name = \"microsoft/phi-4\"\n",
- "dataset_name = \"math_10k.json\"\n",
- "\n",
- "# QLoRA configuration\n",
- "bnb_config = BitsAndBytesConfig(\n",
- " load_in_4bit=True,\n",
- " bnb_4bit_quant_type=\"nf4\",\n",
- " bnb_4bit_compute_dtype=torch.bfloat16,\n",
- " bnb_4bit_use_double_quant=True,\n",
- ")\n",
- "\n",
- "# LoRA configuration\n",
- "peft_config = LoraConfig(\n",
- " r=16, # Increased from original for better adaptation\n",
- " lora_alpha=32,\n",
- " lora_dropout=0.05,\n",
- " bias=\"none\",\n",
- " task_type=\"CAUSAL_LM\",\n",
- " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"dense\"],\n",
- ")\n",
- "\n",
- "# Training configuration\n",
- "training_args = TrainingArguments(\n",
- " output_dir=\"./results\",\n",
- " num_train_epochs=1,\n",
- " per_device_train_batch_size=1, # Reduced for Phi-4\n",
- " gradient_accumulation_steps=2,\n",
- " optim=\"paged_adamw_32bit\",\n",
- " save_steps=100,\n",
- " logging_steps=10,\n",
- " learning_rate=2e-4,\n",
- " weight_decay=0.001,\n",
- " fp16=False,\n",
- " bf16=True,\n",
- " max_grad_norm=0.3,\n",
- " warmup_ratio=0.03,\n",
- " lr_scheduler_type=\"cosine\",\n",
- " gradient_checkpointing=True,\n",
- " report_to=\"none\"\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
- "metadata": {},
- "source": [
- "## Load and Prepare Model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load tokenizer\n",
- "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
- "tokenizer.pad_token = tokenizer.eos_token\n",
- "tokenizer.padding_side = \"right\"\n",
- "\n",
- "# Load model with quantization\n",
- "model = AutoModelForCausalLM.from_pretrained(\n",
- " model_name,\n",
- " quantization_config=bnb_config,\n",
- " device_map=\"auto\",\n",
- " trust_remote_code=True\n",
- ")\n",
- "\n",
- "# Prepare model for k-bit training\n",
- "model = prepare_model_for_kbit_training(model)\n",
- "\n",
- "# Apply LoRA\n",
- "model = get_peft_model(model, peft_config)\n",
- "model.print_trainable_parameters()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
- "metadata": {},
- "source": [
- "## Load and Prepare Dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
- "metadata": {},
- "outputs": [],
- "source": [
- "def format_prompt(example):\n",
- " if example[\"input\"]:\n",
- " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
- "\n",
- "### Instruction:\n",
- "{example['instruction']}\n",
- "\n",
- "### Input:\n",
- "{example['input']}\n",
- "\n",
- "### Response:\n",
- "{example['output']}\"\"\"\n",
- " else:\n",
- " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
- "\n",
- "### Instruction:\n",
- "{example['instruction']}\n",
- "\n",
- "### Response:\n",
- "{example['output']}\"\"\"\n",
- "\n",
- "# Load dataset\n",
- "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\")\n",
- "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)})\n",
- "\n",
- "# Split dataset\n",
- "dataset = dataset.train_test_split(test_size=0.1)\n",
- "train_dataset = dataset[\"train\"]\n",
- "eval_dataset = dataset[\"test\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
- "metadata": {},
- "source": [
- "## Enhanced Training with SFTTrainer"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
- "metadata": {},
- "outputs": [],
- "source": [
- "trainer = SFTTrainer(\n",
- " model=model,\n",
- " train_dataset=train_dataset,\n",
- " eval_dataset=eval_dataset,\n",
- " peft_config=peft_config,\n",
- " dataset_text_field=\"text\",\n",
- " max_seq_length=1024,\n",
- " tokenizer=tokenizer,\n",
- " args=training_args,\n",
- " packing=True,\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "810eb75e",
- "metadata": {},
- "source": [
- "## Federated Averaging Function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "58298e8e-ab9e-4377-966e-143823441697",
- "metadata": {},
- "outputs": [],
- "source": [
- "def FedAvg(peft_params, model, weights=None):\n",
- " \"\"\"\n",
- " Perform Federated Averaging (FedAvg) on the model parameters.\n",
- " \"\"\"\n",
- " state_dicts = peft_params\n",
- " state_dict = get_peft_model_state_dict(model)\n",
- " for key in peft_params[0]:\n",
- " dtype = state_dicts[0][key].dtype\n",
- " state_dict[key] = torch.from_numpy(\n",
- " np.average(\n",
- " [state[key].to(torch.float).numpy() for state in state_dicts], \n",
- " axis=0, \n",
- " weights=weights\n",
- " )\n",
- " ).to(dtype)\n",
- " set_peft_model_state_dict(model, state_dict)\n",
- " return model"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba",
- "metadata": {},
- "source": [
- "## Federated Learning Workflow"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
- "metadata": {},
- "outputs": [],
- "source": [
- "class FederatedFlow(FLSpec):\n",
- " def __init__(self, model=None, rounds=3, **kwargs):\n",
- " super().__init__(**kwargs)\n",
- " if model is not None:\n",
- " self.model = model\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " else:\n",
- " raise ValueError(\"No model provided\")\n",
- " \n",
- " self.rounds = rounds\n",
- " \n",
- " @aggregator\n",
- " def start(self):\n",
- " print(\"Initializing federated learning\")\n",
- " self.collaborators = self.runtime.collaborators\n",
- " self.current_round = 0\n",
- " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
- " \n",
- " @collaborator\n",
- " def aggregated_model_validation(self):\n",
- " print(f\"Validating aggregated model for {self.input}\")\n",
- " # Load model with quantization\n",
- " self.model = AutoModelForCausalLM.from_pretrained(\n",
- " model_name,\n",
- " quantization_config=bnb_config,\n",
- " device_map=\"auto\",\n",
- " trust_remote_code=True\n",
- " )\n",
- " self.model = prepare_model_for_kbit_training(self.model)\n",
- " self.model = get_peft_model(self.model, peft_config)\n",
- " set_peft_model_state_dict(self.model, self.peft_params)\n",
- " \n",
- " # Evaluate\n",
- " eval_results = trainer.evaluate()\n",
- " self.agg_validation_score = eval_results[\"eval_loss\"]\n",
- " print(f\"Validation loss: {self.agg_validation_score}\")\n",
- " self.next(self.train)\n",
- " \n",
- " @collaborator\n",
- " def train(self):\n",
- " print(f\"Training on {self.input}\")\n",
- " # Train with local data\n",
- " trainer.train()\n",
- " self.loss = trainer.state.log_history[-1][\"loss\"]\n",
- " self.next(self.local_model_validation)\n",
- " \n",
- " @collaborator\n",
- " def local_model_validation(self):\n",
- " print(f\"Validating local model for {self.input}\")\n",
- " eval_results = trainer.evaluate()\n",
- " self.local_validation_score = eval_results[\"eval_loss\"]\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " self.next(self.join, exclude=[\"model\"])\n",
- " \n",
- " @aggregator\n",
- " def join(self, inputs):\n",
- " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n",
- " self.aggregated_model_accuracy = sum(\n",
- " input.agg_validation_score for input in inputs\n",
- " ) / len(inputs)\n",
- " self.local_model_accuracy = sum(\n",
- " input.local_validation_score for input in inputs\n",
- " ) / len(inputs)\n",
- " \n",
- " print(f\"Round {self.current_round + 1} results:\")\n",
- " print(f\"Average training loss: {self.average_loss}\")\n",
- " print(f\"Average validation loss (before training): {self.aggregated_model_accuracy}\")\n",
- " print(f\"Average validation loss (after training): {self.local_model_accuracy}\")\n",
- " \n",
- " # Federated averaging\n",
- " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " \n",
- " self.current_round += 1\n",
- " if self.current_round < self.rounds:\n",
- " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n",
- " else:\n",
- " self.next(self.end)\n",
- " \n",
- " @aggregator\n",
- " def end(self):\n",
- " print(\"Federated training complete!\")\n",
- " print(f\"Final model validation loss: {self.aggregated_model_accuracy}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7bc8fe27",
- "metadata": {},
- "source": [
- "## Run Federated Learning"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Setup participants\n",
- "aggregator = Aggregator()\n",
- "collaborators = [\n",
- " Collaborator(name=\"Portland\"),\n",
- " Collaborator(name=\"Seattle\"),\n",
- " Collaborator(name=\"London\")\n",
- "]\n",
- "\n",
- "# Assign data shards\n",
- "for idx, colab in enumerate(collaborators):\n",
- " colab.private_attributes = {\n",
- " \"train_dataset\": train_dataset.shard(len(collaborators), idx),\n",
- " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx)\n",
- " }\n",
- "\n",
- "# Create and run workflow\n",
- "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n",
- "flflow = FederatedFlow(model, rounds=3)\n",
- "flflow.runtime = runtime\n",
- "flflow.run()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
From 5321986b45a7ab381a8e9851725360203bc49dc0 Mon Sep 17 00:00:00 2001
From: Rajith
Date: Fri, 16 May 2025 11:12:37 +0530
Subject: [PATCH 28/34] reverting local changes
---
.github/workflows/trivy.yml | 243 +++++++++++++-----------------------
CODEOWNERS | 2 +-
2 files changed, 85 insertions(+), 160 deletions(-)
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 7c9b79761a..62b196f3d1 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -1,4 +1,4 @@
-name: Trivy Nightly Security Scan
+name: Trivy Nightly Scan
on:
workflow_call:
inputs:
@@ -7,192 +7,117 @@ on:
type: string
workflow_dispatch:
schedule:
- - cron: '0 0 * * *' # Runs daily at midnight UTC
+ - cron: '0 0 * * *' # This runs the workflow every night at midnight UTC
jobs:
- security-scan:
+ build:
if: github.event.pull_request.draft == false
permissions:
- contents: read
- security-events: write
- actions: read
- packages: read
- issues: write
+ contents: read # for actions/checkout to fetch code
+ security-events: write # for github/codeql-action/upload-sarif to upload SARIF results
+ actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status
runs-on: ubuntu-22.04
- timeout-minutes: 45
+ timeout-minutes: 15
+
env:
- TRIVY_VERSION: 0.50.1
+ TRIVY_DB_REPOSITORY: 'ghcr.io/aquasecurity/trivy-db,public.ecr.aws/aquasecurity/trivy-db'
COMMIT_ID: ${{ inputs.commit_id || github.sha }}
steps:
- # ============ SETUP PHASE ============
- - name: Checkout repository
+ - name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ env.COMMIT_ID }}
- fetch-depth: 0
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: '3.10'
+ - name: Install Trivy
+ run: |
+ curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin
- # ============ SCANNING PHASE ============
- - name: Run filesystem scan
- uses: aquasecurity/trivy-action@0.30.0
+ - name: Run Trivy code vulnerability scanner (JSON Output)
+ run: |
+ trivy --quiet fs \
+ --format json \
+ --output trivy-code-results.json \
+ --ignore-unfixed \
+ --vuln-type os,library \
+ --severity CRITICAL,HIGH,MEDIUM \
+ .
+
+ - name: Display Trivy code Scan Results
+ if: failure() # Runs only when a previous step has failed
+ run: |
+ echo "Trivy Scan Results:"
+ cat trivy-code-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}'
+
+ - name: Upload Code Vulnerability Scan Results
+ uses: actions/upload-artifact@v4
with:
- scan-type: 'fs'
- format: 'json'
- output: 'trivy-fs-report.json'
- severity: 'CRITICAL,HIGH'
- ignore-unfixed: true
- vuln-type: 'os,library'
- security-checks: 'vuln'
+ name: trivy-code-report-json
+ path: trivy-code-results.json
- - name: Build Docker image
+ - name: Build an image from Dockerfile
run: |
- docker buildx build \
- --pull \
- --tag local/scan-target:${{ github.run_id }} \
- --file openfl-docker/Dockerfile.base \
- --load \
- .
+ docker build --pull -t docker.io/securefederatedai/openfl:${{ github.sha }} -f openfl-docker/Dockerfile.base .
- - name: Scan Docker image
+ - name: Run Trivy vulnerability scanner for Docker image (JSON Output)
+ id: trivy-scan
uses: aquasecurity/trivy-action@0.30.0
with:
- image-ref: 'local/scan-target:${{ github.run_id }}'
+ image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}'
format: 'json'
- output: 'trivy-image-report.json'
- severity: 'CRITICAL,HIGH'
+ output: 'trivy-docker-results.json'
+ exit-code: '1'
ignore-unfixed: true
vuln-type: 'os,library'
- security-checks: 'vuln'
-
- # ============ REPORTING PHASE ============
- - name: Generate SBOM reports
- run: |
- trivy fs --format spdx-json --output trivy-fs-sbom.json .
- trivy image --format spdx-json --output trivy-image-sbom.json local/scan-target:${{ github.run_id }}
+ severity: 'CRITICAL,HIGH,MEDIUM'
+ trivyignores: '.trivyignore'
- - name: Create consolidated report
- id: report
+ - name: Display Trivy Docker Scan Results
+ if: failure() # Runs only when a previous step has failed
run: |
- # Initialize markdown report
- echo "# Security Scan Report - OpenFL" > report.md
- echo "**Scan Date:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> report.md
- echo "**Commit:** [${{ env.COMMIT_ID }}](https://github.com/rajithkrishnegowda/openfl/commit/${{ env.COMMIT_ID }})" >> report.md
- echo -e "\n## Vulnerability Summary\n" >> report.md
-
- # Process filesystem results
- if [ -f "trivy-fs-report.json" ]; then
- FS_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-fs-report.json || echo 0)
- echo "### Filesystem Scans" >> report.md
- echo "**Critical/High Vulnerabilities:** $FS_VULNS" >> report.md
-
- if [ "$FS_VULNS" -gt 0 ]; then
- echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md
- echo "|----------|----|---------|---------|-------------|" >> report.md
- jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md
- echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT
- fi
- fi
-
- # Process image results
- if [ -f "trivy-image-report.json" ]; then
- IMG_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-image-report.json || echo 0)
- echo -e "\n### Container Image Scans" >> report.md
- echo "**Critical/High Vulnerabilities:** $IMG_VULNS" >> report.md
-
- if [ "$IMG_VULNS" -gt 0 ]; then
- echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md
- echo "|----------|----|---------|---------|-------------|" >> report.md
- jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md
- echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT
- fi
- fi
-
- # Add artifact download links
- echo -e "\n## Next Steps\n" >> report.md
- echo "1. Review the full reports in the workflow artifacts" >> report.md
- echo "2. Address critical vulnerabilities immediately" >> report.md
- echo "3. Create GitHub issues for tracking remediation" >> report.md
-
- cat report.md
-
- # ============ NOTIFICATION PHASE ============
- - name: Set notification subject
- id: set-subject
- run: |
- if [[ "${{ job.status }}" == "failure" ]]; then
- echo "subject=🚨 OpenFL Security Scan Failed" >> $GITHUB_OUTPUT
- elif [[ "${{ steps.report.outputs.has_vulnerabilities }}" == "true" ]]; then
- echo "subject=⚠️ OpenFL Vulnerabilities Found" >> $GITHUB_OUTPUT
+ if [ -s trivy-docker-results.json ]; then
+ echo "Trivy Scan Results:"
+ cat trivy-docker-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}'
else
- echo "subject=✅ OpenFL Security Scan Passed" >> $GITHUB_OUTPUT
+ echo "Trivy scan results file is empty or not found."
fi
-
- - name: Extract CODEOWNERS emails
- id: codeowners
- run: |
- if ! command -v python &> /dev/null; then
- sudo apt-get update && sudo apt-get install -y python3
- fi
-
- OUTPUT=$(python .github/scripts/extract_emails.py)
- echo "Extracted emails: $OUTPUT"
-
- EMAILS=$(echo "$OUTPUT" | jq -r '.emails | join(",")')
- echo "emails=${EMAILS:-${{ secrets.SECURITY_EMAIL_RECIPIENTS }}}" >> $GITHUB_OUTPUT
-
- env:
- PYTHONIOENCODING: utf-8
-
- - name: Prepare email content
- id: prepare-email
- run: |
- # Convert markdown to HTML
- python -m pip install markdown
- HTML_CONTENT=$(python -c "import markdown; print(markdown.markdown(open('report.md').read()))")
- echo "html_body<> $GITHUB_OUTPUT
- echo "$HTML_CONTENT" >> $GITHUB_OUTPUT
- echo "EOF" >> $GITHUB_OUTPUT
-
- - name: Send email via Python script
- if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure())
- env:
- SMTP_SERVER: ${{ secrets.SMTP_SERVER }}
- SMTP_PORT: ${{ secrets.SMTP_PORT }}
- SMTP_USER: ${{ secrets.SMTP_USER }}
- SMTP_PASSWORD: ${{ secrets.SMTP_PASSWORD }}
- RECIPIENTS: ${{ steps.codeowners.outputs.emails }}
- run: |
- python .github/scripts/send_email.py \
- --sender "security@openfl.github" \
- --to "$RECIPIENTS" \
- --subject "${{ steps.set-subject.outputs.subject }}" \
- --body "${{ steps.prepare-email.outputs.html_body }}" \
- --smtp-user "$SMTP_USER" \
- --smtp-pwd "$SMTP_PASSWORD" \
- --smtp-server "$SMTP_SERVER:$SMTP_PORT" \
- --html-body
-
- # ============ ARTIFACT UPLOADS ============
- - name: Upload scan artifacts
+
+ - name: Upload final Trivy Docker Vulnerability Scan
uses: actions/upload-artifact@v4
with:
- name: security-reports-${{ github.run_id }}
- path: |
- trivy-fs-report.json
- trivy-image-report.json
- trivy-fs-sbom.json
- trivy-image-sbom.json
- report.md
- retention-days: 30
+ name: trivy-docker-report-json
+ path: trivy-docker-results.json
- # ============ FAILURE HANDLING ============
- - name: Fail workflow if vulnerabilities found
- if: steps.report.outputs.has_vulnerabilities == 'true' && github.event_name != 'schedule'
+ - name: Run Trivy code vulnerability scanner (SPDX-JSON Output)
run: |
- echo "::error::Critical/High vulnerabilities detected!"
- exit 1
+ trivy --quiet fs \
+ --format spdx-json \
+ --output trivy-code-spdx-results.json \
+ --ignore-unfixed \
+ --vuln-type os,library \
+ --severity CRITICAL,HIGH,MEDIUM \
+ .
+
+ - name: Upload SPDX Code Vulnerability Scan Results
+ uses: actions/upload-artifact@v4
+ with:
+ name: trivy-code-spdx-report-json
+ path: trivy-code-spdx-results.json
+
+ - name: Run Trivy vulnerability scanner for Docker image (SPDX-JSON Output)
+ uses: aquasecurity/trivy-action@0.30.0
+ with:
+ image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}'
+ format: 'spdx-json'
+ output: 'trivy-docker-spdx-results.json'
+ exit-code: '1'
+ ignore-unfixed: true
+ vuln-type: 'os,library'
+ severity: 'CRITICAL,HIGH,MEDIUM'
+ trivyignores: '.trivyignore'
+
+ - name: Upload SPDX Docker Vulnerability Scan
+ uses: actions/upload-artifact@v4
+ with:
+ name: trivy-docker-spdx-report-json
+ path: trivy-docker-spdx-results.json
diff --git a/CODEOWNERS b/CODEOWNERS
index b36cb38d58..cb0b89fc6b 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -30,6 +30,6 @@
/scripts/ aayush.garg@intel.com giribabu.bikki@intel.com karan.shah@intel.com patrick.foley@intel.com srikanth.enugula@intel.com teodor.parvanov@intel.com
# File level ownership
-CODEOWNERS akshay.pant@intel.com karan.shah@intel.com kevin.ta@intel.com noopur@intel.com patrick.foley@intel.com payal.chaurasiya@intel.com rahul.garg@intel.com rajith.krishnegowda@intel.com shailesh.pant@intel.com shailesh.tanwar@intel.com teodor.parvanov@intel.com
+CODEOWNERS aayush.garg@intel.com giribabu.bikki@intel.com patrick.foley@intel.com preethi.asokan@intel.com rahul.garg@intel.com srikanth.enugula@intel.com teodor.parvanov@intel.com
test-requirements.txt akshay.pant@intel.com karan.shah@intel.com kevin.ta@intel.com noopur@intel.com patrick.foley@intel.com payal.chaurasiya@intel.com rahul.garg@intel.com rajith.krishnegowda@intel.com shailesh.pant@intel.com shailesh.tanwar@intel.com teodor.parvanov@intel.com
From ce8dbb916fa8c63e4b1385007d0d3bc599a36b74 Mon Sep 17 00:00:00 2001
From: Rajith
Date: Fri, 16 May 2025 15:44:15 +0530
Subject: [PATCH 29/34] added 4bit and 8bit
---
.../workflow/LLM/phi-4-withquantization.ipynb | 4167 +++++++++++++++++
1 file changed, 4167 insertions(+)
create mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb
new file mode 100644
index 0000000000..8a33373f91
--- /dev/null
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb
@@ -0,0 +1,4167 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "a59f475d-d843-46bc-b75e-10984b687ed3",
+ "metadata": {},
+ "source": [
+ "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
+ "metadata": {},
+ "source": [
+ "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n",
+ "- Parameter-Efficient Fine-Tuning (PEFT)\n",
+ "- 4-bit Quantization (QLoRA)\n",
+ "- Gradient Checkpointing\n",
+ "- Optimized Training Configuration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
+ "metadata": {},
+ "source": [
+ "## Installation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a7ae1a7e-8c16-4c5a-be57-33d84723aed7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fri May 16 07:23:10 2025 \n",
+ "+-----------------------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |\n",
+ "|-----------------------------------------+------------------------+----------------------+\n",
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
+ "| | | MIG M. |\n",
+ "|=========================================+========================+======================|\n",
+ "| 0 NVIDIA H100 NVL Off | 00000001:00:00.0 Off | 0 |\n",
+ "| N/A 41C P0 66W / 400W | 1MiB / 95830MiB | 0% Default |\n",
+ "| | | Disabled |\n",
+ "+-----------------------------------------+------------------------+----------------------+\n",
+ " \n",
+ "+-----------------------------------------------------------------------------------------+\n",
+ "| Processes: |\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
+ "| ID ID Usage |\n",
+ "|=========================================================================================|\n",
+ "| No running processes found |\n",
+ "+-----------------------------------------------------------------------------------------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
+ "metadata": {},
+ "source": [
+ "## Import Libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "be4690ae-0671-4d3a-8f21-620ab865a03e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/azureuser/env_name/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "2025-05-16 07:23:13,756\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "from transformers import (\n",
+ " AutoModelForCausalLM,\n",
+ " AutoTokenizer,\n",
+ " BitsAndBytesConfig,\n",
+ " TrainingArguments\n",
+ ")\n",
+ "from peft import (\n",
+ " LoraConfig,\n",
+ " get_peft_model,\n",
+ " prepare_model_for_kbit_training,\n",
+ " PeftModel\n",
+ ")\n",
+ "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # Added this import\n",
+ "from datasets import load_dataset\n",
+ "from trl import SFTTrainer\n",
+ "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n",
+ "from openfl.experimental.workflow.placement import aggregator, collaborator\n",
+ "from openfl.experimental.workflow.runtime import LocalRuntime\n",
+ "import numpy as np\n",
+ "from transformers.trainer_callback import PrinterCallback\n",
+ "import transformers\n",
+ "import gc\n",
+ "import psutil"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "74fed8f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Memory optimization setup\n",
+ "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\" # Enable dynamic memory allocation\n",
+ "os.environ[\"TRANSFORMERS_ATTN_IMPLEMENTATION\"] = \"flash_attention_2\" # Use optimized attention\n",
+ "\n",
+ "def clear_gpu():\n",
+ " torch.cuda.empty_cache()\n",
+ " gc.collect()\n",
+ "\n",
+ "clear_gpu()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "813b4917",
+ "metadata": {},
+ "source": [
+ "## Acquiring and preprocessing dataset\n",
+ "\n",
+ "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6df7bfb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import libraries needed for downloading and verifying the dataset\n",
+ "import hashlib\n",
+ "import requests\n",
+ "\n",
+ "def file_checksum(file_path, algorithm=\"sha256\"):\n",
+ " \"\"\"\n",
+ " Calculate the checksum of a file using the specified hashing algorithm.\n",
+ " \n",
+ " Args:\n",
+ " file_path (str): The path to the file for which the checksum is to be calculated.\n",
+ " algorithm (str): The hashing algorithm to use (default is 'sha256').\n",
+ " \n",
+ " Returns:\n",
+ " str: The calculated checksum of the file.\n",
+ " \"\"\"\n",
+ " hash_func = hashlib.new(algorithm)\n",
+ " with open(file_path, \"rb\") as f:\n",
+ " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
+ " hash_func.update(chunk)\n",
+ " return hash_func.hexdigest()\n",
+ "\n",
+ "\n",
+ "# Download the dataset if it doesn't exist locally\n",
+ "if not os.path.exists(\"math_10k.json\"):\n",
+ " print(\"Downloading math_10k.json dataset...\")\n",
+ " r = requests.get(\n",
+ " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n",
+ " )\n",
+ " with open(\n",
+ " \"math_10k.json\",\n",
+ " \"wb\",\n",
+ " ) as f:\n",
+ " f.write(r.content)\n",
+ " print(\"Download complete.\")\n",
+ "\n",
+ " # Verify the integrity of the downloaded file\n",
+ " actual_checksum = file_checksum(\"math_10k.json\")\n",
+ " expected_checksum = \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n",
+ " if actual_checksum != expected_checksum:\n",
+ " raise ValueError(\n",
+ " \"Checksum verification failed. The file may have been altered.\"\n",
+ " )\n",
+ " print(\"Checksum verification successful.\")\n",
+ "else:\n",
+ " print(\"Dataset already exists locally.\")\n",
+ "\n",
+ "# Set the dataset path to be used later\n",
+ "dataset_name = \"math_10k.json\""
+ ]
+ },
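+ {
+ "cell_type": "markdown",
+ "id": "checksum-usage-note",
+ "metadata": {},
+ "source": [
+ "As a quick sanity check (a sketch, not part of the original flow), `file_checksum` can be re-run at any point to confirm the local copy still matches the expected digest."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "checksum-usage-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: recompute the SHA-256 digest of the local dataset copy.\n",
+ "# file_checksum is the helper defined in the previous cell.\n",
+ "print(file_checksum(\"math_10k.json\"))"
+ ]
+ },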
+ {
+ "cell_type": "markdown",
+ "id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eada9809-468a-47c6-9b03-55aa887c9487",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Model and dataset\n",
+ "model_name = \"microsoft/phi-4\"\n",
+ "#dataset_name = \"math_10k.json\"\n",
+ "\n",
+ "# 4-bit QLoRA configuration\n",
+ "bnb_config_4bit = BitsAndBytesConfig(\n",
+ " load_in_4bit=True,\n",
+ " bnb_4bit_quant_type=\"nf4\",\n",
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
+ " bnb_4bit_use_double_quant=False,\n",
+ ")\n",
+ "\n",
+ "# 8-bit QLoRA configuration\n",
+ "bnb_config_8bit = BitsAndBytesConfig(\n",
+ " load_in_8bit=True,\n",
+ " llm_int8_enable_fp32_cpu_offload=True,\n",
+ " llm_int8_skip_modules=['lm_head'],\n",
+ " llm_int8_threshold=6.0,\n",
+ " llm_int8_has_fp16_weight=False,\n",
+ ")\n",
+ "\n",
+ "# Active quantization config (will be set to either 4-bit or 8-bit)\n",
+ "bnb_config = bnb_config_4bit # Default to 4-bit\n",
+ "\n",
+ "# LoRA configuration\n",
+ "peft_config = LoraConfig(\n",
+ " r=8, # Increased from original for better adaptation\n",
+ " lora_alpha=16,\n",
+ " lora_dropout=0.01,\n",
+ " bias=\"none\",\n",
+ " task_type=\"CAUSAL_LM\",\n",
+ " target_modules=\"all-linear\",\n",
+ ")\n",
+ "\n",
+ "# Training configuration\n",
+ "training_args = TrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " num_train_epochs=1,\n",
+ " per_device_train_batch_size=2, # Reduced for Phi-4\n",
+ " gradient_accumulation_steps=2,\n",
+ " optim=\"adamw_torch_fused\",\n",
+ " save_steps=100,\n",
+ " logging_steps=10,\n",
+ " learning_rate=3e-4,\n",
+ " weight_decay=0.001,\n",
+ " fp16=False,\n",
+ " bf16=True,\n",
+ " max_grad_norm=0.5,\n",
+ " warmup_ratio=0.02,\n",
+ " lr_scheduler_type=\"cosine\",\n",
+ " gradient_checkpointing=True,\n",
+ " report_to=\"none\"\n",
+ ")"
+ ]
+ },
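+ {
+ "cell_type": "markdown",
+ "id": "quant-select-note",
+ "metadata": {},
+ "source": [
+ "A minimal sketch (not part of the original tutorial): the active config can be chosen by name before the model is loaded. `select_bnb_config` is a hypothetical helper that only wraps the `bnb_config_4bit` and `bnb_config_8bit` objects defined above, mirroring the branch used later in the federated flow."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "quant-select-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical helper: pick the active BitsAndBytesConfig by name.\n",
+ "def select_bnb_config(quant_type):\n",
+ "    if quant_type == \"4bit\":\n",
+ "        return bnb_config_4bit\n",
+ "    if quant_type == \"8bit\":\n",
+ "        return bnb_config_8bit\n",
+ "    raise ValueError(f\"Unknown quant_type: {quant_type}\")\n",
+ "\n",
+ "bnb_config = select_bnb_config(\"4bit\")"
+ ]
+ },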
+ {
+ "cell_type": "markdown",
+ "id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
+ "metadata": {},
+ "source": [
+ "## Load and Prepare Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|██████████| 6/6 [00:04<00:00, 1.33it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "trainable params: 27,852,800 || all params: 14,687,360,000 || trainable%: 0.1896\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load tokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "tokenizer.padding_side = \"right\"\n",
+ "\n",
+ "# Load model with quantization\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " quantization_config=bnb_config,\n",
+ " device_map=\"auto\",\n",
+ " trust_remote_code=True\n",
+ ")\n",
+ "\n",
+ "# Prepare model for k-bit training\n",
+ "model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)\n",
+ "\n",
+ "# Apply LoRA\n",
+ "model = get_peft_model(model, peft_config)\n",
+ "model.print_trainable_parameters()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
+ "metadata": {},
+ "source": [
+ "## Load and Prepare Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def format_prompt(example):\n",
+ " \"\"\"\n",
+ " Format the input example into a standardized prompt structure for training.\n",
+ " \n",
+ " This function creates a consistent instruction-response format for the LLM training:\n",
+ " - Includes instruction, optional input, and expected output\n",
+ " - Using a standardized template inspired by instruction-tuning datasets\n",
+ " \n",
+ " Parameters:\n",
+ " example (dict): Dictionary containing 'instruction', 'input' (optional), and 'output' fields\n",
+ " \n",
+ " Returns:\n",
+ " str: Formatted prompt with consistent structure for model training and evaluation\n",
+ " \"\"\"\n",
+ " # Handle case where input is provided\n",
+ " if example[\"input\"]:\n",
+ " # Format with both instruction and input fields\n",
+ " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
+ "\n",
+ "### Instruction:\n",
+ "{example['instruction']}\n",
+ "\n",
+ "### Input:\n",
+ "{example['input']}\n",
+ "\n",
+ "### Response:\n",
+ "{example['output']}\"\"\"\n",
+ " else:\n",
+ " # Format with only instruction (no input field)\n",
+ " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
+ "\n",
+ "### Instruction:\n",
+ "{example['instruction']}\n",
+ "\n",
+ "### Response:\n",
+ "{example['output']}\"\"\"\n",
+ "\n",
+ "# Load dataset\n",
+ "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\", num_proc=4)\n",
+ "# Apply formatting to each example in the dataset\n",
+ "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)}, num_proc=4)\n",
+ "\n",
+ "# Split dataset into training and evaluation subsets\n",
+ "dataset = dataset.train_test_split(test_size=0.1)\n",
+ "train_dataset = dataset[\"train\"]\n",
+ "eval_dataset = dataset[\"test\"]"
+ ]
+ },
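+ {
+ "cell_type": "markdown",
+ "id": "format-prompt-note",
+ "metadata": {},
+ "source": [
+ "To make the template concrete, here is a small sketch (the sample record below is hypothetical, not drawn from math_10k.json) that prints one formatted prompt and peeks at a real training example."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "format-prompt-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical sample record with the instruction/input/output fields format_prompt expects.\n",
+ "sample = {\"instruction\": \"Add the two numbers.\", \"input\": \"2 + 3\", \"output\": \"5\"}\n",
+ "print(format_prompt(sample))\n",
+ "\n",
+ "# Peek at the first formatted example from the real training split.\n",
+ "print(train_dataset[0][\"text\"][:200])"
+ ]
+ },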
+ {
+ "cell_type": "markdown",
+ "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
+ "metadata": {},
+ "source": [
+ "## Enhanced Training with SFTTrainer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Generating train split: 1832 examples [00:02, 615.45 examples/s]\n",
+ "Generating train split: 197 examples [00:00, 575.95 examples/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "trainer = SFTTrainer(\n",
+ " model=model,\n",
+ " train_dataset=train_dataset,\n",
+ " eval_dataset=eval_dataset,\n",
+ " peft_config=peft_config,\n",
+ " dataset_text_field=\"text\",\n",
+ " max_seq_length=1024,\n",
+ " tokenizer=tokenizer,\n",
+ " args=training_args,\n",
+ " packing=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "810eb75e",
+ "metadata": {},
+ "source": [
+ "## Federated Averaging Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "58298e8e-ab9e-4377-966e-143823441697",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def FedAvg(peft_params, model, weights=None):\n",
+ " \"\"\"\n",
+ " Perform Federated Averaging (FedAvg) on the model parameters.\n",
+ "\n",
+ " Parameters:\n",
+ " peft_params (list): A list of state dictionaries containing the model parameters from different clients.\n",
+ " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n",
+ " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n",
+ "\n",
+ " Returns:\n",
+ " torch.nn.Module: The model with the averaged parameters applied.\n",
+ " \"\"\"\n",
+ " state_dicts = peft_params\n",
+ " state_dict = get_peft_model_state_dict(model)\n",
+ " for key in peft_params[0]:\n",
+ " dtype = state_dicts[0][key].dtype\n",
+ " state_dict[key] = torch.from_numpy(\n",
+ " np.average(\n",
+ " [state[key].cpu().to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n",
+ " )\n",
+ " ).to(dtype)\n",
+ " set_peft_model_state_dict(model, state_dict)\n",
+ " return model"
+ ]
+ },
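+ {
+ "cell_type": "markdown",
+ "id": "fedavg-usage-note",
+ "metadata": {},
+ "source": [
+ "A usage sketch for `FedAvg` (not part of the original flow): the optional `weights` argument lets the aggregator weight each collaborator's adapter by its shard size. The list below reuses the current adapter state twice purely to exercise the function, and the shard sizes are hypothetical."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fedavg-usage-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: weighted FedAvg with weights proportional to (hypothetical) shard sizes.\n",
+ "params = get_peft_model_state_dict(model)\n",
+ "peft_params_list = [params, params]  # stand-ins for two collaborators' adapters\n",
+ "shard_sizes = [915, 96]  # hypothetical example counts per collaborator\n",
+ "weights = [s / sum(shard_sizes) for s in shard_sizes]\n",
+ "model = FedAvg(peft_params_list, model, weights=weights)"
+ ]
+ },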
+ {
+ "cell_type": "markdown",
+ "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba",
+ "metadata": {},
+ "source": [
+ "## Federated Learning Workflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Aggregator step \"start\" registered\n",
+ "Collaborator step \"aggregated_model_validation\" registered\n",
+ "Collaborator step \"train\" registered\n",
+ "Collaborator step \"local_model_validation\" registered\n",
+ "Aggregator step \"join\" registered\n",
+ "Aggregator step \"end\" registered\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Import the required PrinterCallback for proper initialization/removal\n",
+ "from transformers.trainer_callback import PrinterCallback\n",
+ "import transformers\n",
+ "import gc\n",
+ "import psutil\n",
+ "\n",
+ "def get_gpu_memory_info():\n",
+ " \"\"\"Get detailed GPU memory usage information in megabytes.\n",
+ " \n",
+ " This function checks for CUDA availability and returns a dictionary with memory allocation\n",
+ " information including allocated, reserved, and maximum allocated GPU memory.\n",
+ " \n",
+ " Returns:\n",
+ " dict: Dictionary with memory usage information in MB:\n",
+ " - allocated: Currently allocated memory by PyTorch tensors\n",
+ " - reserved: Total memory reserved by PyTorch\n",
+ " - max_allocated: Maximum allocated memory since last reset\n",
+ " \n",
+ " Note:\n",
+ " Returns zeros for all metrics if CUDA is not available or if an error occurs.\n",
+ " \"\"\"\n",
+ " try:\n",
+ " if torch.cuda.is_available():\n",
+ " allocated = torch.cuda.memory_allocated() / (1024 * 1024)\n",
+ " reserved = torch.cuda.memory_reserved() / (1024 * 1024)\n",
+ " max_allocated = torch.cuda.max_memory_allocated() / (1024 * 1024)\n",
+ " return {\n",
+ " \"allocated\": allocated,\n",
+ " \"reserved\": reserved,\n",
+ " \"max_allocated\": max_allocated\n",
+ " }\n",
+ " else:\n",
+ " return {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n",
+ " except:\n",
+ " return {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n",
+ "\n",
+ "class MemoryTracker:\n",
+ " \"\"\"Track GPU memory usage during training\"\"\"\n",
+ " def __init__(self, collaborator_name, quant_type):\n",
+ " self.collaborator_name = collaborator_name\n",
+ " self.quant_type = quant_type\n",
+ " self.timestamps = {}\n",
+ " self.peak = {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n",
+ " self.training_loss = None\n",
+ " self.eval_loss = None\n",
+ " \n",
+ " def log(self, timestamp):\n",
+ " \"\"\"Log current memory usage at a specific timestamp\"\"\"\n",
+ " self.timestamps[timestamp] = get_gpu_memory_info()\n",
+ " \n",
+ " def log_loss(self, training_loss=None, eval_loss=None):\n",
+ " \"\"\"Log training or evaluation loss\"\"\"\n",
+ " if training_loss is not None:\n",
+ " self.training_loss = training_loss\n",
+ " if eval_loss is not None:\n",
+ " self.eval_loss = eval_loss\n",
+ " \n",
+ " def update_peak(self):\n",
+ " \"\"\"Update peak memory usage values\"\"\"\n",
+ " current = get_gpu_memory_info()\n",
+ " self.peak[\"allocated\"] = max(self.peak[\"allocated\"], current[\"allocated\"])\n",
+ " self.peak[\"reserved\"] = max(self.peak[\"reserved\"], current[\"reserved\"])\n",
+ " self.peak[\"max_allocated\"] = max(self.peak[\"max_allocated\"], current[\"max_allocated\"])\n",
+ " \n",
+ " def reset_peak(self):\n",
+ " \"\"\"Reset peak memory usage values\"\"\"\n",
+ " self.peak = {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n",
+ " \n",
+ " def report(self):\n",
+ " \"\"\"Print memory usage report\"\"\"\n",
+ " print(f\"\\n==== Memory Usage Report for {self.collaborator_name} ({self.quant_type}) ====\")\n",
+ " print(f\"Peak Memory Usage:\")\n",
+ " print(f\" Allocated: {self.peak['allocated']:.2f} MB\")\n",
+ " print(f\" Reserved: {self.peak['reserved']:.2f} MB\")\n",
+ " print(f\" Max Allocated: {self.peak['max_allocated']:.2f} MB\")\n",
+ " \n",
+ " print(\"\\nMemory Usage by Stage:\")\n",
+ " for timestamp, mem in self.timestamps.items():\n",
+ " print(f\" {timestamp}:\")\n",
+ " print(f\" Allocated: {mem['allocated']:.2f} MB\")\n",
+ " print(f\" Reserved: {mem['reserved']:.2f} MB\")\n",
+ " print(f\" Max Allocated: {mem['max_allocated']:.2f} MB\")\n",
+ " \n",
+ " print(\"\\nPerformance Metrics:\")\n",
+ " if self.training_loss is not None:\n",
+ " print(f\" Training Loss: {self.training_loss:.4f}\")\n",
+ " if self.eval_loss is not None:\n",
+ " print(f\" Evaluation Loss: {self.eval_loss:.4f}\")\n",
+ " print(\"-\" * 50)\n",
+ " \n",
+ " def get_stats(self):\n",
+ " \"\"\"Get all statistics as a dictionary\"\"\"\n",
+ " stats = {\n",
+ " \"peak_allocated\": self.peak[\"allocated\"],\n",
+ " \"peak_reserved\": self.peak[\"reserved\"],\n",
+ " \"peak_max_allocated\": self.peak[\"max_allocated\"],\n",
+ " \"quant_type\": self.quant_type,\n",
+ " \"training_loss\": self.training_loss,\n",
+ " \"eval_loss\": self.eval_loss\n",
+ " }\n",
+ " for timestamp, mem in self.timestamps.items():\n",
+ " stats[f\"{timestamp}_allocated\"] = mem[\"allocated\"]\n",
+ " stats[f\"{timestamp}_reserved\"] = mem[\"reserved\"]\n",
+ " stats[f\"{timestamp}_max_allocated\"] = mem[\"max_allocated\"]\n",
+ " return stats\n",
+ "\n",
+ "def plot_memory_metrics(flow_4bit, flow_8bit):\n",
+ " \"\"\"Plot and compare memory metrics between 4-bit and 8-bit quantization.\"\"\"\n",
+ " try:\n",
+ " import matplotlib.pyplot as plt\n",
+ " import pandas as pd\n",
+ " from matplotlib.ticker import EngFormatter\n",
+ "\n",
+ " # Create figure with multiple subplots\n",
+ " fig, axs = plt.subplots(2, 2, figsize=(16, 12))\n",
+ " fig.suptitle('4-bit vs 8-bit Quantization Comparison', fontsize=16)\n",
+ " \n",
+ " # Colors for consistent plotting\n",
+ " colors_4bit = {'Portland': 'blue', 'Seattle': 'green'}\n",
+ " colors_8bit = {'Portland': 'darkblue', 'Seattle': 'darkgreen'}\n",
+ " markers_4bit = {'Portland': 'o', 'Seattle': 's'}\n",
+ " markers_8bit = {'Portland': '^', 'Seattle': 'D'}\n",
+ " \n",
+ " # Flatten the metric data for plotting\n",
+ " memory_data = []\n",
+ " for quant, flow in [(\"4-bit\", flow_4bit), (\"8-bit\", flow_8bit)]:\n",
+ " stats = flow.all_memory_stats\n",
+ " for collab, rounds_data in stats.items():\n",
+ " for round_name, metrics in rounds_data.items():\n",
+ " round_num = int(round_name.split('_')[1])\n",
+ " row = {\n",
+ " 'Collaborator': collab,\n",
+ " 'Round': round_num,\n",
+ " 'Quantization': quant,\n",
+ " 'Peak Memory (MB)': metrics.get('peak_max_allocated', 0),\n",
+ " 'Training Loss': metrics.get('training_loss', 0),\n",
+ " 'Eval Loss': metrics.get('eval_loss', 0)\n",
+ " }\n",
+ " memory_data.append(row)\n",
+ " \n",
+ " df = pd.DataFrame(memory_data)\n",
+ " \n",
+ " # Plot 1: Peak Memory Usage by Round\n",
+ " axs[0, 0].set_title('Peak Memory Usage by Round')\n",
+ " for quant_type in ['4-bit', '8-bit']:\n",
+ " for collab in df['Collaborator'].unique():\n",
+ " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n",
+ " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n",
+ " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n",
+ " axs[0, 0].plot(subset['Round'], subset['Peak Memory (MB)'], \n",
+ " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n",
+ " color=color)\n",
+ " \n",
+ " axs[0, 0].set_xlabel('Round')\n",
+ " axs[0, 0].set_ylabel('Memory (MB)')\n",
+ " axs[0, 0].legend()\n",
+ " axs[0, 0].grid(True, alpha=0.3)\n",
+ " axs[0, 0].yaxis.set_major_formatter(EngFormatter(unit='B'))\n",
+ " \n",
+ " # Plot 2: Training Loss by Round\n",
+ " axs[0, 1].set_title('Training Loss by Round')\n",
+ " for quant_type in ['4-bit', '8-bit']:\n",
+ " for collab in df['Collaborator'].unique():\n",
+ " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n",
+ " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n",
+ " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n",
+ " axs[0, 1].plot(subset['Round'], subset['Training Loss'], \n",
+ " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n",
+ " color=color)\n",
+ " \n",
+ " axs[0, 1].set_xlabel('Round')\n",
+ " axs[0, 1].set_ylabel('Loss')\n",
+ " axs[0, 1].legend()\n",
+ " axs[0, 1].grid(True, alpha=0.3)\n",
+ " \n",
+ " # Plot 3: Eval Loss by Round\n",
+ " axs[1, 0].set_title('Evaluation Loss by Round')\n",
+ " for quant_type in ['4-bit', '8-bit']:\n",
+ " for collab in df['Collaborator'].unique():\n",
+ " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n",
+ " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n",
+ " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n",
+ " axs[1, 0].plot(subset['Round'], subset['Eval Loss'], \n",
+ " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n",
+ " color=color)\n",
+ " \n",
+ " axs[1, 0].set_xlabel('Round')\n",
+ " axs[1, 0].set_ylabel('Loss')\n",
+ " axs[1, 0].legend()\n",
+ " axs[1, 0].grid(True, alpha=0.3)\n",
+ " \n",
+ " # Plot 4: Memory vs Loss (bubble chart)\n",
+ " axs[1, 1].set_title('Memory Usage vs. Evaluation Loss')\n",
+ " for quant_type in ['4-bit', '8-bit']:\n",
+ " for collab in df['Collaborator'].unique():\n",
+ " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n",
+ " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n",
+ " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n",
+ " \n",
+ " # Size proportional to round number for visual differentiation\n",
+ " sizes = [100 * (r+1) for r in subset['Round']]\n",
+ " \n",
+ " axs[1, 1].scatter(subset['Peak Memory (MB)'], subset['Eval Loss'],\n",
+ " s=sizes, alpha=0.7, \n",
+ " label=f\"{collab} ({quant_type})\",\n",
+ " color=color, marker=marker)\n",
+ " \n",
+ " # Add round number annotations\n",
+ " for _, row in subset.iterrows():\n",
+ " axs[1, 1].annotate(f\"R{int(row['Round'])}\", \n",
+ " (row['Peak Memory (MB)'], row['Eval Loss']),\n",
+ " xytext=(5, 5), textcoords='offset points')\n",
+ " \n",
+ " axs[1, 1].set_xlabel('Peak Memory (MB)')\n",
+ " axs[1, 1].set_ylabel('Evaluation Loss')\n",
+ " axs[1, 1].legend()\n",
+ " axs[1, 1].grid(True, alpha=0.3)\n",
+ " axs[1, 1].xaxis.set_major_formatter(EngFormatter(unit='B'))\n",
+ " \n",
+ " plt.tight_layout()\n",
+ " plt.subplots_adjust(top=0.92)\n",
+ " plt.show()\n",
+ " \n",
+ " # Print summary comparison\n",
+ " print(\"\\n==== Performance Summary ====\\n\")\n",
+ " # Group by quantization and compute means\n",
+ " summary = df.groupby('Quantization').agg({\n",
+ " 'Peak Memory (MB)': 'mean',\n",
+ " 'Training Loss': 'mean', \n",
+ " 'Eval Loss': 'mean'\n",
+ " }).reset_index()\n",
+ " \n",
+ " # Calculate percentage difference\n",
+ " mem_diff_pct = ((summary.loc[1, 'Peak Memory (MB)'] - summary.loc[0, 'Peak Memory (MB)']) / \n",
+ " summary.loc[0, 'Peak Memory (MB)'] * 100)\n",
+ " \n",
+ " eval_diff_pct = ((summary.loc[1, 'Eval Loss'] - summary.loc[0, 'Eval Loss']) / \n",
+ " summary.loc[0, 'Eval Loss'] * 100)\n",
+ " \n",
+ " print(f\"Memory Usage Comparison:\")\n",
+ " print(f\" 4-bit Avg: {summary.loc[0, 'Peak Memory (MB)']:.2f} MB\")\n",
+ " print(f\" 8-bit Avg: {summary.loc[1, 'Peak Memory (MB)']:.2f} MB\")\n",
+ " print(f\" Difference: {abs(mem_diff_pct):.1f}% {'more' if mem_diff_pct > 0 else 'less'} memory with 8-bit\")\n",
+ " \n",
+ " print(f\"\\nEvaluation Loss Comparison:\")\n",
+ " print(f\" 4-bit Avg: {summary.loc[0, 'Eval Loss']:.4f}\")\n",
+ " print(f\" 8-bit Avg: {summary.loc[1, 'Eval Loss']:.4f}\")\n",
+ " print(f\" Difference: {abs(eval_diff_pct):.1f}% {'higher' if eval_diff_pct > 0 else 'lower'} loss with 8-bit\")\n",
+ " \n",
+ " loss_efficiency = ((summary.loc[0, 'Eval Loss'] - summary.loc[1, 'Eval Loss']) / \n",
+ " (summary.loc[0, 'Peak Memory (MB)'] - summary.loc[1, 'Peak Memory (MB)']))\n",
+ " \n",
+ " if loss_efficiency > 0:\n",
+ " efficiency_msg = \"8-bit provides better memory efficiency with lower loss\"\n",
+ " else:\n",
+ " efficiency_msg = \"4-bit provides better memory efficiency with lower loss\"\n",
+ " \n",
+ " print(f\"\\nEfficiency Analysis: {efficiency_msg}\")\n",
+ " except ImportError:\n",
+ " print(\"Plotting requires matplotlib and pandas. Install with: pip install matplotlib pandas\")\n",
+ " except Exception as e:\n",
+ " print(f\"Error plotting metrics: {str(e)}\")\n",
+ "\n",
+ "class FederatedFlow(FLSpec):\n",
+ " def __init__(self, model=None, optimizer=None, rounds=3, quant_type=\"4bit\", **kwargs):\n",
+ " \"\"\"\n",
+ " Initialize the class with the given model, optimizer, and number of rounds.\n",
+ "\n",
+ " Parameters:\n",
+ " model (torch.nn.Module, optional): The model to be used. If None, a ValueError is raised.\n",
+ " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n",
+ " rounds (int, optional): The number of rounds for training or processing (default is 3).\n",
+ " quant_type (str, optional): Quantization type, either \"4bit\" or \"8bit\".\n",
+ " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n",
+ "\n",
+ " Raises:\n",
+ " ValueError: If no model is provided.\n",
+ " \"\"\"\n",
+ " super().__init__(**kwargs)\n",
+ " if model is not None:\n",
+ " self.model = model\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " self.optimizer = optimizer\n",
+ " else:\n",
+ " raise ValueError(\"No model inputted\")\n",
+ "\n",
+ " self.rounds = rounds\n",
+ " self.quant_type = quant_type\n",
+ " # Initialize histories for tracking metrics over rounds\n",
+ " self.average_loss_history = []\n",
+ " self.agg_model_loss_history = []\n",
+ " self.local_model_loss_history = []\n",
+ " \n",
+ "\n",
+ " @aggregator\n",
+ " def start(self):\n",
+ " \"\"\"\n",
+ " Initialize the model and set up the collaborators for federated learning.\n",
+ "\n",
+ " This method performs the initial setup for the model, including setting the\n",
+ " collaborators, initializing private variables, and starting the first round\n",
+ " of the federated learning process.\n",
+ " \"\"\"\n",
+ " print(f\"Performing initialization for model with {self.quant_type} quantization\")\n",
+ " self.collaborators = self.runtime.collaborators\n",
+ " self.current_round = 0\n",
+ " # Initialize dictionary to collect memory stats\n",
+ " # Check if collaborators are objects with name attribute or strings\n",
+ " if hasattr(self.collaborators[0], 'name'):\n",
+ " collab_names = [c.name for c in self.collaborators]\n",
+ " else:\n",
+ " # If collaborators are already strings, use them directly\n",
+ " collab_names = self.collaborators\n",
+ " self.all_memory_stats = {collab: {} for collab in collab_names}\n",
+ " self.next(\n",
+ " self.aggregated_model_validation,\n",
+ " foreach=\"collaborators\",\n",
+ " )\n",
+ "\n",
+ " \n",
+ " @collaborator\n",
+ " def aggregated_model_validation(self):\n",
+ " \"\"\"\n",
+ " Perform aggregated model validation for a collaborator.\n",
+ "\n",
+ " This method loads the model, applies the PEFT configuration, and evaluates\n",
+ " the model using the provided training and evaluation datasets. The validation\n",
+ " score is then stored and the next step in the process is triggered.\n",
+ " \"\"\"\n",
+ " print(f\"Performing aggregated model validation for collaborator {self.input} with {self.quant_type}\")\n",
+ " # Initialize memory tracker for this collaborator\n",
+ " self.memory_tracker = MemoryTracker(self.input, self.quant_type)\n",
+ " self.memory_tracker.reset_peak()\n",
+ " \n",
+ " # Choose quantization config based on quant_type\n",
+ " if self.quant_type == \"4bit\":\n",
+ " quant_config = bnb_config_4bit\n",
+ " else: # 8bit\n",
+ " quant_config = bnb_config_8bit\n",
+ " \n",
+ " # Define device_map variable\n",
+ " #device_map = \"auto\"\n",
+ " device_map = {\"\": torch.cuda.current_device()} if torch.cuda.is_available() else \"cpu\"\n",
+ " try:\n",
+ " self.model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " quantization_config=quant_config,\n",
+ " device_map=device_map,\n",
+ " trust_remote_code=True\n",
+ " )\n",
+ " self.memory_tracker.log(\"model_load\")\n",
+ " except ValueError:\n",
+ " # Fallback to CPU if GPU memory is insufficient\n",
+ " print(f\"Falling back to CPU mode for {self.input}\")\n",
+ " self.model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " device_map=\"cpu\",\n",
+ " trust_remote_code=True\n",
+ " )\n",
+ " self.memory_tracker.log(\"model_load\")\n",
+ " \n",
+ " self.model = prepare_model_for_kbit_training(self.model)\n",
+ " self.model = get_peft_model(self.model, peft_config)\n",
+ " set_peft_model_state_dict(self.model, self.peft_params)\n",
+ " \n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_args,\n",
+ " peft_config=peft_config,\n",
+ " train_dataset=self.train_dataset,\n",
+ " eval_dataset=self.eval_dataset,\n",
+ " max_seq_length=1024,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ "\n",
+ " trainer.remove_callback(PrinterCallback)\n",
+ " out = trainer.evaluate()\n",
+ " self.agg_validation_score = out[\"eval_loss\"]\n",
+ " print(f\"{self.input} value of {self.agg_validation_score}\")\n",
+ " self.memory_tracker.log_loss(eval_loss=self.agg_validation_score) # Log eval loss\n",
+ " self.memory_tracker.update_peak()\n",
+ " self.next(self.train)\n",
+ "\n",
+ " @collaborator\n",
+ " def train(self):\n",
+ " \"\"\"\n",
+ " Train the model for a collaborator.\n",
+ "\n",
+ " This method trains the model using the provided training and evaluation datasets.\n",
+ " The training loss is stored, the model is saved, and the next step in the process\n",
+ " is triggered.\n",
+ " \"\"\"\n",
+ " self.memory_tracker.log(\"before_training\")\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_args,\n",
+ " peft_config=peft_config,\n",
+ " train_dataset=self.train_dataset,\n",
+ " eval_dataset=self.eval_dataset,\n",
+ " max_seq_length=1024,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ "\n",
+ " out = trainer.train()\n",
+ " self.loss = out.training_loss\n",
+ " self.memory_tracker.log(\"after_training\")\n",
+ " self.memory_tracker.log_loss(training_loss=self.loss) # Log training loss\n",
+ " self.memory_tracker.update_peak()\n",
+ " trainer.save_model()\n",
+ " self.training_completed = True\n",
+ " self.next(self.local_model_validation)\n",
+ "\n",
+ " @collaborator\n",
+ " def local_model_validation(self):\n",
+ " \"\"\"\n",
+ " Perform local model validation for a collaborator.\n",
+ "\n",
+ " This method evaluates the model using the provided training and evaluation datasets.\n",
+ " The validation score is stored, the PEFT parameters are updated, and the next step\n",
+ " in the process is triggered.\n",
+ " \"\"\"\n",
+ " trainer = SFTTrainer(\n",
+ " model=self.model,\n",
+ " args=training_args,\n",
+ " peft_config=peft_config,\n",
+ " train_dataset=self.train_dataset,\n",
+ " eval_dataset=self.eval_dataset,\n",
+ " max_seq_length=1024,\n",
+ " dataset_text_field=\"text\",\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " data_collator=transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+ " ),\n",
+ " )\n",
+ " out = trainer.evaluate()\n",
+ " self.local_validation_score = out[\"eval_loss\"]\n",
+ " self.memory_tracker.log_loss(eval_loss=self.local_validation_score) # Log eval loss\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ " print(f\"Doing local model validation for collaborator {self.input}\")\n",
+ " \n",
+ " # Display memory report for this collaborator\n",
+ " self.memory_tracker.report()\n",
+ " self.memory_stats = self.memory_tracker.get_stats()\n",
+ " self.next(self.join, exclude=[\"training_completed\", \"model\", \"memory_tracker\"])\n",
+ "\n",
+ " @aggregator\n",
+ " def join(self, inputs):\n",
+ " \"\"\"\n",
+ " Aggregate the results from all collaborators and update the model.\n",
+ "\n",
+ " This method calculates the average loss, aggregated model accuracy, and local model\n",
+ " accuracy from all collaborators. The model parameters are updated using Federated\n",
+ " Averaging (FedAvg), and the next round of the process is triggered if applicable.\n",
+ " \"\"\"\n",
+ " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n",
+ " self.aggregated_model_accuracy = sum(\n",
+ " input.agg_validation_score for input in inputs\n",
+ " ) / len(inputs)\n",
+ " self.local_model_accuracy = sum(\n",
+ " input.local_validation_score for input in inputs\n",
+ " ) / len(inputs)\n",
+ " print(\n",
+ " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n",
+ " )\n",
+ " print(f\"Average training loss = {self.average_loss}\")\n",
+ " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n",
+ "\n",
+ " # Store metrics in history for plotting trends\n",
+ " self.average_loss_history.append(self.average_loss)\n",
+ " self.agg_model_loss_history.append(self.aggregated_model_accuracy)\n",
+ " self.local_model_loss_history.append(self.local_model_accuracy)\n",
+ " \n",
+ " # Collect memory stats from all collaborators for this round\n",
+ " for input_data in inputs:\n",
+ " self.all_memory_stats[input_data.input][f\"round_{self.current_round}\"] = input_data.memory_stats\n",
+ "\n",
+ " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n",
+ " self.peft_params = get_peft_model_state_dict(self.model)\n",
+ "\n",
+ " self.model.save_pretrained(\"./aggregated/model\")\n",
+ " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n",
+ " self.current_round += 1\n",
+ " if self.current_round < self.rounds:\n",
+ " self.next(\n",
+ " self.aggregated_model_validation,\n",
+ " foreach=\"collaborators\",\n",
+ " exclude=[\"model\"],\n",
+ " )\n",
+ " else:\n",
+ " self.next(self.end)\n",
+ "\n",
+ " @aggregator\n",
+ " def end(self):\n",
+ " \"\"\"\n",
+ " End the federated learning process.\n",
+ "\n",
+ " This method marks the end of the federated learning process and performs any\n",
+ " necessary cleanup or finalization steps.\n",
+ " \"\"\"\n",
+ " print(f\"This is the end of the flow for {self.quant_type} quantization\")\n",
+ " print(\"\\n===== Final Metrics =====\\n\")\n",
+ " print(f\"Average Training Loss: {self.average_loss_history[-1]:.4f}\")\n",
+ " print(f\"Final Aggregated Model Loss: {self.agg_model_loss_history[-1]:.4f}\")\n",
+ " print(f\"Final Local Model Loss: {self.local_model_loss_history[-1]:.4f}\")\n",
+ " \n",
+ " print(\"\\n===== Memory Usage Summary Across All Rounds =====\\n\")\n",
+ " \n",
+ " # Print aggregated memory statistics\n",
+ " for collab, rounds_data in self.all_memory_stats.items():\n",
+ " print(f\"\\n==== {collab} Memory Usage Across Rounds ({self.quant_type}) ====\\n\")\n",
+ " for round_name, stats in rounds_data.items():\n",
+ " print(f\" {round_name}:\")\n",
+ " for metric, value in stats.items():\n",
+ " if value is not None:\n",
+ " if metric in ['training_loss', 'eval_loss', 'quant_type']:\n",
+ " if metric != 'quant_type':\n",
+ " print(f\" {metric}: {value:.4f}\")\n",
+ " else:\n",
+ " print(f\" {metric}: {value:.2f} MB\")\n",
+ " else:\n",
+ " print(f\" {metric}: Not recorded\")\n",
+ " print(\"-\" * 50)"
+ ]
+ },
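+ {
+ "cell_type": "markdown",
+ "id": "memtracker-usage-note",
+ "metadata": {},
+ "source": [
+ "`MemoryTracker` can also be exercised outside the flow. A minimal standalone sketch (the tensor allocation is only there to move the numbers):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "memtracker-usage-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Standalone sketch of the MemoryTracker defined above.\n",
+ "tracker = MemoryTracker(\"demo\", \"4bit\")\n",
+ "tracker.log(\"before_alloc\")\n",
+ "x = torch.randn(1024, 1024, device=\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "tracker.log(\"after_alloc\")\n",
+ "tracker.update_peak()\n",
+ "tracker.report()\n",
+ "del x; clear_gpu()"
+ ]
+ },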
+ {
+ "cell_type": "markdown",
+ "id": "7bc8fe27",
+ "metadata": {},
+ "source": [
+ "## Run Federated Learning with 4-bit and 8-bit Quantization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "=============== Running with 4-bit Quantization ===============\n",
+ "\n",
+ "\n",
+ "Calling start\n",
+ "\u001b[94mPerforming initialization for model with 4bit quantization\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Generating train split: 915 examples [00:01, 626.55 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "Generating train split: 96 examples [00:00, 570.74 examples/s]\u001b[0m\u001b[94m0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [12/12 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland value of 0.5819987058639526\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Generating train split: 915 examples [00:01, 619.44 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "Generating train split: 96 examples [00:00, 628.10 examples/s]\u001b[0m\u001b[94m0m\u001b[94m\n",
+ "\u001b[0m`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [12/12 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35323.02 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60420.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 32979.33 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 45302.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 48011.95 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35078.37 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60420.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35323.02 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 41244.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.3295\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3029\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Generating train split: 917 examples [00:01, 622.35 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "Generating train split: 100 examples [00:00, 631.84 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle value of 0.5914124846458435\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 23170.02 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 48256.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 20890.32 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 33280.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 22957.36 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 48256.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 23170.02 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 29538.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.3287\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3178\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94mAverage aggregated model validation values = 0.5867055952548981\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage training loss = 0.3290792714039832\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage local model validation values = 0.310357466340065\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [12/12 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland value of 0.30139902234077454\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [12/12 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 59824.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33255.83 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 33440.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35322.87 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 59824.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 41948.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.2949\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.2986\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle value of 0.3157660961151123\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 59804.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33255.83 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 33424.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35322.87 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 59804.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 41528.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.2942\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3126\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94mAverage aggregated model validation values = 0.3085825592279434\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage training loss = 0.29453011579388616\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage local model validation values = 0.30557093024253845\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling end\n",
+ "\u001b[94mThis is the end of the flow for 4bit quantization\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "===== Final Metrics =====\n",
+ "\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage Training Loss: 0.2945\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.3086\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mFinal Local Model Loss: 0.3056\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "===== Memory Usage Summary Across All Rounds =====\n",
+ "\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Portland Memory Usage Across Rounds (4bit) ====\n",
+ "\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_allocated: 35323.02 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_reserved: 60420.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m training_loss: 0.3295\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m eval_loss: 0.3029\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_allocated: 32979.33 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_reserved: 45302.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_max_allocated: 48011.95 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_allocated: 35078.37 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_reserved: 60420.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_allocated: 35323.02 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_reserved: 41244.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_reserved: 59824.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m training_loss: 0.2949\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m eval_loss: 0.2986\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_allocated: 33255.83 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_reserved: 33440.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_allocated: 35322.87 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_reserved: 59824.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_reserved: 41948.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Seattle Memory Usage Across Rounds (4bit) ====\n",
+ "\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_allocated: 23170.02 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_reserved: 48256.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m training_loss: 0.3287\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m eval_loss: 0.3178\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_allocated: 20890.32 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_reserved: 33280.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_allocated: 22957.36 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_reserved: 48256.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_allocated: 23170.02 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_reserved: 29538.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_reserved: 59804.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m peak_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m training_loss: 0.2942\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m eval_loss: 0.3126\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_allocated: 33255.83 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_reserved: 33424.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_allocated: 35322.87 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_reserved: 59804.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_reserved: 41528.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mCleared CUDA cache between runs\n",
+ "\n",
+ "=============== Running with 8-bit Quantization ===============\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|██████████| 6/6 [00:04<00:00, 1.36it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling start\n",
+ "\u001b[94mPerforming initialization for model with 8bit quantization\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [12/12 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland value of 0.5662918090820312\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [12/12 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 75278.13 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 93310.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 72811.06 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 91120.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 91914.16 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 75278.13 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 93310.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 75250.57 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 83158.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.3243\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.2989\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle value of 0.5757399201393127\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 58276.76 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 83620.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 55775.01 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 74152.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 58145.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 83620.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 58276.76 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 65726.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.3242\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3134\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94mAverage aggregated model validation values = 0.571015864610672\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage training loss = 0.3242545044578319\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mAverage local model validation values = 0.30610978603363037\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [12/12 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland value of 0.296934574842453\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [12/12 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 75488.79 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 93552.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 73023.56 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 74272.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 75488.79 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 93552.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 75466.38 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 82944.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.2914\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.2939\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle value of 0.31116044521331787\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plot aggregated metrics comparison\n",
+ "print(\"\\n==== Aggregated Performance Metrics: 4-bit vs 8-bit ====\\n\")\n",
+ "plot_aggregated_metrics(flflow_4bit, flflow_8bit)"
+ ]
+ },
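The per-stage numbers in the reports above (Allocated / Reserved / Max Allocated, per stage and at peak) are standard torch.cuda counters sampled at each stage of a round. The notebook's actual tracking helper is defined earlier in the file; the sketch below is a minimal stand-in with hypothetical names (snapshot, report_gpu_memory) that reproduces the report format:

import torch

def snapshot():
    """Capture the current CUDA memory counters (in bytes) for one stage."""
    return {
        "Allocated": torch.cuda.memory_allocated(),
        "Reserved": torch.cuda.memory_reserved(),
        "Max Allocated": torch.cuda.max_memory_allocated(),
    }

def report_gpu_memory(collaborator, mode, stages):
    """Print a per-stage memory report in MB, mirroring the output format above."""
    mb = 1024 ** 2
    print(f"==== Memory Usage Report for {collaborator} ({mode}) ====")
    print("Peak Memory Usage:")
    for key in ("Allocated", "Reserved", "Max Allocated"):
        print(f"  {key}: {max(s[key] for s in stages.values()) / mb:.2f} MB")
    print("\nMemory Usage by Stage:")
    for stage, counters in stages.items():
        print(f"  {stage}:")
        for key, value in counters.items():
            print(f"    {key}: {value / mb:.2f} MB")

# Usage: record a snapshot at each stage, then print one report per round:
# stages = {"model_load": snapshot(), "before_training": snapshot(), "after_training": snapshot()}
# report_gpu_memory("Portland", "8bit", stages)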
+ {
+ "cell_type": "markdown",
+ "id": "4c089e09",
+ "metadata": {},
+ "source": [
+ "### Memory Efficiency Comparison\n",
+ "\n",
+ "- **4-bit Quantization**: Uses less memory overall, allowing for larger batch sizes or model sizes on the same hardware.\n",
+ "- **8-bit Quantization**: Requires more memory but still offers significant savings compared to full precision (FP16/FP32).\n",
+ "- **Peak Memory Usage**: The difference in peak memory consumption shows the trade-off between precision and memory requirements.\n",
+ "\n",
+ "### Training Performance Comparison\n",
+ "\n",
+ "- **Training Loss**: 8-bit quantization typically maintains closer fidelity to the original model, potentially leading to slightly better training convergence.\n",
+ "- **Evaluation Loss**: The evaluation metrics help determine if the higher precision of 8-bit quantization translates to better model performance.\n",
+ "\n",
+ "### Use Case Recommendations\n",
+ "\n",
+ "- **Resource-constrained environments**: 4-bit quantization provides better memory efficiency for edge devices or limited GPU resources.\n",
+ "- **Higher precision needs**: If model accuracy is critical and resources permit, 8-bit quantization offers a good balance between performance and efficiency.\n",
+ "- **Federated Learning Impact**: The quantization choice particularly affects resource utilization across collaborators in the federated setting."
+ ]
+ }
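The comparison above hinges on a single knob: the BitsAndBytesConfig passed when each collaborator loads the base model. A minimal sketch of the two configurations, consistent with the 4-bit settings used elsewhere in this tutorial; the 8-bit config shown is the standard bitsandbytes LLM.int8() setup, not copied from the notebook:

import torch
from transformers import BitsAndBytesConfig

# 4-bit (QLoRA-style): smallest footprint, NF4 weights with bfloat16 compute
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 8-bit (LLM.int8()): larger footprint than 4-bit, closer to full-precision fidelity
bnb_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
)

Either config is passed unchanged as quantization_config to AutoModelForCausalLM.from_pretrained; everything else in the flow stays identical between the two runs.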
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python (myenv)",
+ "language": "python",
+ "name": "myenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
From f2a2eedf04a3e8be624e35cb7de2319b58214e4d Mon Sep 17 00:00:00 2001
From: Rajith
Date: Fri, 16 May 2025 15:45:01 +0530
Subject: [PATCH 30/34] removed older code
---
.../LLM/phi-4-with4bit quantization.ipynb | 1772 -----------------
1 file changed, 1772 deletions(-)
delete mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb
diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb
deleted file mode 100644
index e2efa9054b..0000000000
--- a/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb
+++ /dev/null
@@ -1,1772 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "a59f475d-d843-46bc-b75e-10984b687ed3",
- "metadata": {},
- "source": [
- "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf",
- "metadata": {},
- "source": [
- "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n",
- "- Parameter-Efficient Fine-Tuning (PEFT)\n",
- "- 4-bit Quantization (QLoRA)\n",
- "- Gradient Checkpointing\n",
- "- Optimized Training Configuration"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f",
- "metadata": {},
- "source": [
- "## Installation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc",
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "a7ae1a7e-8c16-4c5a-be57-33d84723aed7",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Thu May 15 13:27:27 2025 \n",
- "+-----------------------------------------------------------------------------------------+\n",
- "| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |\n",
- "|-----------------------------------------+------------------------+----------------------+\n",
- "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
- "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
- "| | | MIG M. |\n",
- "|=========================================+========================+======================|\n",
- "| 0 NVIDIA H100 NVL Off | 00000001:00:00.0 Off | 0 |\n",
- "| N/A 39C P0 62W / 400W | 1MiB / 95830MiB | 0% Default |\n",
- "| | | Disabled |\n",
- "+-----------------------------------------+------------------------+----------------------+\n",
- " \n",
- "+-----------------------------------------------------------------------------------------+\n",
- "| Processes: |\n",
- "| GPU GI CI PID Type Process name GPU Memory |\n",
- "| ID ID Usage |\n",
- "|=========================================================================================|\n",
- "| No running processes found |\n",
- "+-----------------------------------------------------------------------------------------+\n"
- ]
- }
- ],
- "source": [
- "!nvidia-smi"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f",
- "metadata": {},
- "source": [
- "## Import Libraries"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "be4690ae-0671-4d3a-8f21-620ab865a03e",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/azureuser/env_name/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n",
- "2025-05-15 13:27:30,648\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
- ]
- }
- ],
- "source": [
- "# System imports\n",
- "import os\n",
- "import numpy as np\n",
- "\n",
- "# PyTorch imports\n",
- "import torch\n",
- "\n",
- "# Hugging Face Transformers imports for model loading and training\n",
- "from transformers import (\n",
- " AutoModelForCausalLM, # For loading large language models\n",
- " AutoTokenizer, # For tokenizing text inputs\n",
- " BitsAndBytesConfig, # For 4-bit quantization configuration\n",
- " TrainingArguments # For configuring training hyperparameters\n",
- ")\n",
- "\n",
- "# PEFT (Parameter-Efficient Fine-Tuning) imports\n",
- "from peft import (\n",
- " LoraConfig, # For configuring Low-Rank Adaptation\n",
- " get_peft_model, # For applying PEFT to a model\n",
- " prepare_model_for_kbit_training, # For preparing quantized models for training\n",
- " PeftModel # Base class for PEFT models\n",
- ")\n",
- "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # For state dict manipulation\n",
- "\n",
- "# Dataset and training imports\n",
- "from datasets import load_dataset\n",
- "from trl import SFTTrainer # Supervised Fine-Tuning Trainer\n",
- "\n",
- "# OpenFL imports for federated learning\n",
- "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n",
- "from openfl.experimental.workflow.placement import aggregator, collaborator\n",
- "from openfl.experimental.workflow.runtime import LocalRuntime"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "06274755",
- "metadata": {},
- "source": [
- "## Acquiring and preprocessing dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a6edefa4",
- "metadata": {},
- "source": [
- "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "962ac825",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import libraries needed for downloading and verifying the dataset\n",
- "import hashlib\n",
- "import requests\n",
- "\n",
- "def file_checksum(file_path, algorithm=\"sha256\"):\n",
- " \"\"\"\n",
- " Calculate the checksum of a file using the specified hashing algorithm.\n",
- " \n",
- " Args:\n",
- " file_path (str): The path to the file for which the checksum is to be calculated.\n",
- " algorithm (str): The hashing algorithm to use (default is 'sha256').\n",
- " \n",
- " Returns:\n",
- " str: The calculated checksum of the file.\n",
- " \"\"\"\n",
- " hash_func = hashlib.new(algorithm)\n",
- " with open(file_path, \"rb\") as f:\n",
- " for chunk in iter(lambda: f.read(4096), b\"\"):\n",
- " hash_func.update(chunk)\n",
- " return hash_func.hexdigest()\n",
- "\n",
- "\n",
- "# Download the dataset if it doesn't exist locally\n",
- "if not os.path.exists(\"math_10k.json\"):\n",
- " print(\"Downloading math_10k.json dataset...\")\n",
- " r = requests.get(\n",
- " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n",
- " )\n",
- " with open(\n",
- " \"math_10k.json\",\n",
- " \"wb\",\n",
- " ) as f:\n",
- " f.write(r.content)\n",
- " print(\"Download complete.\")\n",
- "\n",
- " # Verify the integrity of the downloaded file\n",
- " actual_checksum = file_checksum(\"math_10k.json\")\n",
- " expected_checksum = \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n",
- " if actual_checksum != expected_checksum:\n",
- " raise ValueError(\n",
- " \"Checksum verification failed. The file may have been altered.\"\n",
- " )\n",
- " print(\"Checksum verification successful.\")\n",
- "else:\n",
- " print(\"Dataset already exists locally.\")\n",
- "\n",
- "# Set the dataset path to be used later\n",
- "dataset_name = \"math_10k.json\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "08576aa0-f628-4ae6-8fc3-dd167d164784",
- "metadata": {},
- "source": [
- "## Configuration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "eada9809-468a-47c6-9b03-55aa887c9487",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Model and dataset configuration\n",
- "model_name = \"microsoft/phi-4\" # Pre-trained model identifier from Hugging Face Hub\n",
- "#dataset_name = \"math_10k.json\" # Dataset file containing mathematical QA pairs\n",
- "\n",
- "# QLoRA (Quantized Low-Rank Adaptation) configuration for 4-bit quantization\n",
- "# This reduces memory footprint while maintaining model quality\n",
- "bnb_config = BitsAndBytesConfig(\n",
- " load_in_4bit=True, # Enable 4-bit quantization\n",
- " bnb_4bit_quant_type=\"nf4\", # Use normalized float 4 format for better precision\n",
- " bnb_4bit_compute_dtype=torch.bfloat16, # Computation precision\n",
- " bnb_4bit_use_double_quant=False, # Disable nested quantization for simplicity\n",
- ")\n",
- "\n",
- "# LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning\n",
- "# This allows fine-tuning with significantly fewer parameters\n",
- "peft_config = LoraConfig(\n",
- " r=8, # Rank of the update matrices (higher = more capacity but more parameters)\n",
- " lora_alpha=16, # Scaling factor for the trained weights\n",
- " lora_dropout=0.01, # Dropout probability for LoRA layers\n",
- " bias=\"none\", # Don't train bias parameters to reduce memory\n",
- " task_type=\"CAUSAL_LM\", # Specify causal language modeling task\n",
- " target_modules=\"all-linear\", # Apply LoRA to all linear layers\n",
- ")\n",
- "\n",
- "# Training hyperparameters configuration\n",
- "training_args = TrainingArguments(\n",
- " output_dir=\"./results\", # Directory to save checkpoints and logs\n",
- " num_train_epochs=1, # Number of training epochs\n",
- " per_device_train_batch_size=2, # Batch size per GPU/TPU core\n",
- " gradient_accumulation_steps=2, # Number of updates steps to accumulate before backward pass\n",
- " optim=\"adamw_torch_fused\", # Optimizer to use (fused for better performance)\n",
- " save_steps=100, # Save checkpoint every X updates steps\n",
- " logging_steps=10, # Log metrics every X updates steps\n",
- " learning_rate=3e-4, # Initial learning rate\n",
- " weight_decay=0.001, # Weight decay regularization\n",
- " fp16=False, # Disable FP16 training (using BF16 instead)\n",
- " bf16=True, # Enable BF16 training (better numerical stability than FP16)\n",
- " max_grad_norm=0.5, # Max gradient norm for gradient clipping\n",
- " warmup_ratio=0.02, # Portion of steps for learning rate warmup\n",
- " lr_scheduler_type=\"cosine\", # Learning rate scheduler type\n",
- " gradient_checkpointing=True, # Enable gradient checkpointing to save memory\n",
- " report_to=\"none\" # Disable reporting to tracking platforms\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ffe93234-2a1a-4809-a431-efe2f35ce496",
- "metadata": {},
- "source": [
- "## Load and Prepare Model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.36it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "trainable params: 27,852,800 || all params: 14,687,360,000 || trainable%: 0.1896\n"
- ]
- }
- ],
- "source": [
- "# Load tokenizer\n",
- "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
- "tokenizer.pad_token = tokenizer.eos_token\n",
- "tokenizer.padding_side = \"right\"\n",
- "\n",
- "# Load model with quantization\n",
- "model = AutoModelForCausalLM.from_pretrained(\n",
- " model_name,\n",
- " quantization_config=bnb_config,\n",
- " device_map=\"auto\",\n",
- " trust_remote_code=True\n",
- ")\n",
- "\n",
- "# Prepare model for k-bit training\n",
- "model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)\n",
- "\n",
- "# Apply LoRA\n",
- "model = get_peft_model(model, peft_config)\n",
- "model.print_trainable_parameters()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d",
- "metadata": {},
- "source": [
- "## Load and Prepare Dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457",
- "metadata": {},
- "outputs": [],
- "source": [
- "def format_prompt(example):\n",
- " \"\"\"\n",
- " Format a dataset example into a standardized prompt-response format for instruction tuning.\n",
- " \n",
- " This function converts raw dataset examples into a structured format suitable for\n",
- " instruction fine-tuning of large language models. The format follows the common\n",
- " pattern used for instruction-following tasks with clear section demarcation.\n",
- " \n",
- " Args:\n",
- " example (dict): A dictionary containing the example data with keys:\n",
- " - 'instruction': The task instruction\n",
- " - 'input': The optional input context (may be empty)\n",
- " - 'output': The expected output/response\n",
- " \n",
- " Returns:\n",
- " str: A formatted prompt string with instruction, optional input, and response\n",
- " \"\"\"\n",
- " if example[\"input\"]:\n",
- " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
- "\n",
- "### Instruction:\n",
- "{example['instruction']}\n",
- "\n",
- "### Input:\n",
- "{example['input']}\n",
- "\n",
- "### Response:\n",
- "{example['output']}\"\"\"\n",
- " else:\n",
- " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
- "\n",
- "### Instruction:\n",
- "{example['instruction']}\n",
- "\n",
- "### Response:\n",
- "{example['output']}\"\"\"\n",
- "\n",
- "# Load dataset from JSON file (contains mathematical question-answer pairs)\n",
- "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\", num_proc=4)\n",
- "\n",
- "# Transform raw examples into formatted text for instruction tuning\n",
- "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)}, num_proc=4)\n",
- "\n",
- "# Split dataset into training (90%) and evaluation (10%) sets\n",
- "dataset = dataset.train_test_split(test_size=0.1)\n",
- "train_dataset = dataset[\"train\"]\n",
- "eval_dataset = dataset[\"test\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b",
- "metadata": {},
- "source": [
- "## Enhanced Training with SFTTrainer"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "Generating train split: 1820 examples [00:02, 613.71 examples/s]\n",
- "Generating train split: 209 examples [00:00, 582.95 examples/s]\n"
- ]
- }
- ],
- "source": [
- "trainer = SFTTrainer(\n",
- " model=model,\n",
- " train_dataset=train_dataset,\n",
- " eval_dataset=eval_dataset,\n",
- " peft_config=peft_config,\n",
- " dataset_text_field=\"text\",\n",
- " max_seq_length=1024,\n",
- " tokenizer=tokenizer,\n",
- " args=training_args,\n",
- " packing=True,\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "810eb75e",
- "metadata": {},
- "source": [
- "## Federated Averaging Function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "58298e8e-ab9e-4377-966e-143823441697",
- "metadata": {},
- "outputs": [],
- "source": [
- "def FedAvg(peft_params, model, weights=None):\n",
- " \"\"\"\n",
- " Perform Federated Averaging (FedAvg) on the model parameters.\n",
- " \n",
- " This function aggregates PEFT parameters from multiple collaborators using weighted\n",
- " averaging. It handles the complex task of averaging parameters while maintaining \n",
- " the correct tensor types and shapes required by the PEFT framework.\n",
- " \n",
- " Args:\n",
- " peft_params (list): A list of state dictionaries containing PEFT parameters from different collaborators.\n",
- " model (torch.nn.Module): The base model to which the averaged parameters will be applied.\n",
- " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n",
- " Weights determine the contribution of each collaborator to the final model.\n",
- " \n",
- " Returns:\n",
- " torch.nn.Module: The model with the averaged parameters applied.\n",
- " \n",
- " Notes:\n",
- " The function converts tensors to float for averaging to avoid precision issues,\n",
- " then converts back to the original data type for model compatibility.\n",
- " \"\"\"\n",
- " # Store the state dictionaries for easy access\n",
- " state_dicts = peft_params\n",
- " # Get the current state dict from the model as a template\n",
- " state_dict = get_peft_model_state_dict(model)\n",
- " \n",
- " # Iterate through each parameter in the first state dict as reference\n",
- " for key in peft_params[0]:\n",
- " # Store original data type for later conversion\n",
- " dtype = state_dicts[0][key].dtype\n",
- " \n",
- " # Convert all tensors to float, move to CPU, perform weighted average\n",
- " state_dict[key] = torch.from_numpy(\n",
- " np.average(\n",
- " [state[key].cpu().to(torch.float).numpy() for state in state_dicts], \n",
- " axis=0, \n",
- " weights=weights\n",
- " )\n",
- " ).to(dtype) # Convert back to original data type\n",
- " \n",
- " # Apply the averaged parameters back to the model\n",
- " set_peft_model_state_dict(model, state_dict)\n",
- " return model"
- ]
- },
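FedAvg is called with weights=None in the join step below, i.e. equal weighting. As a usage sketch, weighting each collaborator's contribution by its local training-set size (913 and 907 examples per the logs later in this notebook) would look like this; the peft-params variable names are illustrative:

# Hypothetical: LoRA state dicts collected from the two collaborators
peft_params_list = [portland_peft_params, seattle_peft_params]

# Weight each contribution by local training-set size
sizes = [913, 907]
weights = [s / sum(sizes) for s in sizes]

model = FedAvg(peft_params_list, model, weights=weights)

np.average normalizes the weights internally, so relative sizes are enough; only the small LoRA adapter tensors are averaged, never the frozen base weights.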
- {
- "cell_type": "markdown",
- "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba",
- "metadata": {},
- "source": [
- "## Federated Learning Workflow"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e5e108c6-5150-4931-9c01-6b64a913fa04",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Aggregator step \"start\" registered\n",
- "Collaborator step \"aggregated_model_validation\" registered\n",
- "Collaborator step \"train\" registered\n",
- "Collaborator step \"local_model_validation\" registered\n",
- "Aggregator step \"join\" registered\n",
- "Aggregator step \"end\" registered\n"
- ]
- }
- ],
- "source": [
- "# Import the required PrinterCallback for proper initialization/removal\n",
- "from transformers.trainer_callback import PrinterCallback\n",
- "import transformers\n",
- "\n",
- "class FederatedFlow(FLSpec):\n",
- " \"\"\"\n",
- " Federated Learning workflow for fine-tuning Phi-4 model with PEFT and quantization.\n",
- " \n",
- " This class implements the complete federated learning workflow for a language model,\n",
- " including initialization, aggregated model validation, training, local model validation,\n",
- " and parameter aggregation. It uses Parameter-Efficient Fine-Tuning (PEFT) with 4-bit\n",
- " quantization to efficiently train large language models in memory-constrained environments.\n",
- " \n",
- " The workflow follows these steps for each round:\n",
- " 1. Initialize model on each collaborator\n",
- " 2. Validate the aggregated model on local data\n",
- " 3. Train the model locally on each collaborator\n",
- " 4. Validate the locally trained model\n",
- " 5. Aggregate PEFT parameters from all collaborators using FedAvg\n",
- " 6. Repeat for specified number of rounds\n",
- " \n",
- " Attributes:\n",
- " model: The base language model being fine-tuned\n",
- " peft_params: PEFT parameters dictionary for the model\n",
- " optimizer: Optimizer for training (optional)\n",
- " rounds: Number of federated learning rounds to perform\n",
- " current_round: Counter for the current round\n",
- " collaborators: List of collaborators participating in federated learning\n",
- " \"\"\"\n",
- " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n",
- " \"\"\"\n",
- " Initialize the federated learning workflow.\n",
- " \n",
- " Args:\n",
- " model: The base language model to fine-tune. Must be provided.\n",
- " optimizer: Optional optimizer for model training.\n",
- " rounds: Number of federated learning rounds to perform (default: 3).\n",
- " **kwargs: Additional arguments passed to the parent class.\n",
- " \n",
- " Raises:\n",
- " ValueError: If no model is provided.\n",
- " \"\"\"\n",
- " super().__init__(**kwargs)\n",
- " if model is not None:\n",
- " self.model = model\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " self.optimizer = optimizer\n",
- " else:\n",
- " raise ValueError(\"No model inputted\")\n",
- "\n",
- " self.rounds = rounds\n",
- " \n",
- "\n",
- " @aggregator\n",
- " def start(self):\n",
- " \"\"\"\n",
- " Start the federated learning process on the aggregator.\n",
- " \n",
- " This method initializes the workflow by:\n",
- " 1. Setting up the list of collaborators from the runtime\n",
- " 2. Initializing the current round counter\n",
- " 3. Starting the first step of the workflow by sending the model\n",
- " to all collaborators for validation\n",
- " \n",
- " The @aggregator decorator ensures this method runs on the aggregator node.\n",
- " \"\"\"\n",
- " print(f\"Performing initialization for model\")\n",
- " self.collaborators = self.runtime.collaborators\n",
- " self.current_round = 0\n",
- " # Start the workflow by sending the model to all collaborators\n",
- " self.next(\n",
- " self.aggregated_model_validation,\n",
- " foreach=\"collaborators\",\n",
- " )\n",
- "\n",
- " \n",
- " @collaborator\n",
- " def aggregated_model_validation(self):\n",
- " \"\"\"\n",
- " Validate the aggregated model on each collaborator's local dataset.\n",
- " \n",
- " This method:\n",
- " 1. Loads the model with appropriate quantization configuration\n",
- " 2. Applies the PEFT configuration and parameters\n",
- " 3. Creates a trainer with local validation dataset\n",
- " 4. Evaluates the model and records the validation loss\n",
- " 5. Transitions to the training phase\n",
- " \n",
- " The @collaborator decorator ensures this method runs on each collaborator node.\n",
- " \n",
- " Notes:\n",
- " Includes fallback to CPU if GPU memory is insufficient\n",
- " \"\"\"\n",
- " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n",
- " # Load model with quantization and CPU offloading if needed\n",
- " device_map = \"auto\" \n",
- " try:\n",
- " # Try to load model on GPU with quantization\n",
- " self.model = AutoModelForCausalLM.from_pretrained(\n",
- " model_name,\n",
- " quantization_config=bnb_config,\n",
- " device_map=device_map,\n",
- " #max_memory={0: \"4GiB\", \"cpu\": \"24GiB\"},\n",
- " trust_remote_code=True\n",
- " )\n",
- " except ValueError:\n",
- " # Fallback to CPU if GPU memory is insufficient\n",
- " print(f\"Falling back to CPU mode for {self.input}\")\n",
- " self.model = AutoModelForCausalLM.from_pretrained(\n",
- " model_name,\n",
- " device_map=\"cpu\",\n",
- " trust_remote_code=True\n",
- " )\n",
- " \n",
- " # Prepare model for training with quantization\n",
- " self.model = prepare_model_for_kbit_training(self.model)\n",
- " # Apply PEFT configuration (LoRA)\n",
- " self.model = get_peft_model(self.model, peft_config)\n",
- " # Load aggregated parameters\n",
- " set_peft_model_state_dict(self.model, self.peft_params)\n",
- " \n",
- " # Setup trainer for evaluation\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=training_args,\n",
- " peft_config=peft_config,\n",
- " train_dataset=self.train_dataset,\n",
- " eval_dataset=self.eval_dataset,\n",
- " max_seq_length=1024,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- "\n",
- " # Remove default printer callback to avoid verbose output\n",
- " trainer.remove_callback(PrinterCallback)\n",
- " # Evaluate model and store metrics\n",
- " out = trainer.evaluate()\n",
- " self.agg_validation_score = out[\"eval_loss\"]\n",
- " print(f\"{self.input} value of {self.agg_validation_score}\")\n",
- " # Move to training phase\n",
- " self.next(self.train)\n",
- "\n",
- " @collaborator\n",
- " def train(self):\n",
- " \"\"\"\n",
- " Train the model on each collaborator's local dataset.\n",
- " \n",
- " This method:\n",
- " 1. Creates an SFTTrainer with the local training dataset\n",
- " 2. Runs the training process\n",
- " 3. Records the training loss\n",
- " 4. Saves the trained model\n",
- " 5. Transitions to local validation phase\n",
- " \n",
- " The @collaborator decorator ensures this method runs on each collaborator node.\n",
- " \"\"\"\n",
- " # Setup trainer for local training\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=training_args,\n",
- " peft_config=peft_config,\n",
- " train_dataset=self.train_dataset,\n",
- " eval_dataset=self.eval_dataset,\n",
- " max_seq_length=1024,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- "\n",
- " # Execute training\n",
- " out = trainer.train()\n",
- " # Store training loss for later analysis\n",
- " self.loss = out.training_loss\n",
- " # Save locally trained model\n",
- " trainer.save_model()\n",
- " self.training_completed = True\n",
- " # Move to local validation phase\n",
- " self.next(self.local_model_validation)\n",
- "\n",
- " @collaborator\n",
- " def local_model_validation(self):\n",
- " \"\"\"\n",
- " Validate the locally trained model on each collaborator's validation dataset.\n",
- " \n",
- " This method:\n",
- " 1. Creates an SFTTrainer with the local validation dataset\n",
- " 2. Evaluates the locally trained model\n",
- " 3. Records the validation loss\n",
- " 4. Extracts the PEFT parameters for aggregation\n",
- " 5. Sends results to the aggregator for parameter aggregation\n",
- " \n",
- " The @collaborator decorator ensures this method runs on each collaborator node.\n",
- " \n",
- " Notes:\n",
- " Excludes the full model and training flags from the data sent to the aggregator\n",
- " to reduce communication overhead\n",
- " \"\"\"\n",
- " # Setup trainer for evaluation\n",
- " trainer = SFTTrainer(\n",
- " model=self.model,\n",
- " args=training_args,\n",
- " peft_config=peft_config,\n",
- " train_dataset=self.train_dataset,\n",
- " eval_dataset=self.eval_dataset,\n",
- " max_seq_length=1024,\n",
- " dataset_text_field=\"text\",\n",
- " tokenizer=tokenizer,\n",
- " packing=True,\n",
- " data_collator=transformers.DataCollatorForSeq2Seq(\n",
- " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
- " ),\n",
- " )\n",
- " # Evaluate the locally trained model\n",
- " out = trainer.evaluate()\n",
- " self.local_validation_score = out[\"eval_loss\"]\n",
- " # Extract PEFT parameters for aggregation\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- " print(f\"Doing local model validation for collaborator {self.input}\")\n",
- " # Send results to aggregator, excluding the full model and training flags\n",
- " # to reduce communication overhead\n",
- " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n",
- "\n",
- " @aggregator\n",
- " def join(self, inputs):\n",
- " \"\"\"\n",
- " Aggregate results from all collaborators and update the global model.\n",
- " \n",
- " This method:\n",
- " 1. Calculates average loss, aggregated model accuracy, and local model accuracy\n",
- " 2. Updates the global model using Federated Averaging (FedAvg)\n",
- " 3. Saves the aggregated model and tokenizer\n",
- " 4. Either starts the next round or ends the workflow depending on round count\n",
- " \n",
- " Args:\n",
- " inputs: List of data objects from all collaborators containing validation scores\n",
- " and PEFT parameters.\n",
- " \n",
- " The @aggregator decorator ensures this method runs on the aggregator node.\n",
- " \"\"\"\n",
- " # Calculate average metrics across all collaborators\n",
- " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n",
- " self.aggregated_model_accuracy = sum(\n",
- " input.agg_validation_score for input in inputs\n",
- " ) / len(inputs)\n",
- " self.local_model_accuracy = sum(\n",
- " input.local_validation_score for input in inputs\n",
- " ) / len(inputs)\n",
- " \n",
- " # Display aggregated metrics\n",
- " print(\n",
- " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n",
- " )\n",
- " print(f\"Average training loss = {self.average_loss}\")\n",
- " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n",
- "\n",
- " # Perform federated averaging of model parameters\n",
- " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n",
- " self.peft_params = get_peft_model_state_dict(self.model)\n",
- "\n",
- " # Save the aggregated model for future use\n",
- " self.model.save_pretrained(\"./aggregated/model\")\n",
- " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n",
- " \n",
- " # Increment round counter and start next round or end workflow\n",
- " self.current_round += 1\n",
- " if self.current_round < self.rounds:\n",
- " self.next(\n",
- " self.aggregated_model_validation,\n",
- " foreach=\"collaborators\",\n",
- " exclude=[\"model\"],\n",
- " )\n",
- " else:\n",
- " self.next(self.end)\n",
- "\n",
- " @aggregator\n",
- " def end(self):\n",
- " \"\"\"\n",
- " End the federated learning process.\n",
- " \n",
- " This method marks the end of the federated learning workflow after all rounds\n",
- " have been completed. The final aggregated model and tokenizer are already saved\n",
- " in the last join step.\n",
- " \n",
- " The @aggregator decorator ensures this method runs on the aggregator node.\n",
- " \"\"\"\n",
- " print(f\"This is the end of the flow\")"
- ]
- },
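For context, the run cell that follows wires this flow into a LocalRuntime. A minimal sketch of that setup, assuming the collaborator names used throughout this notebook (Portland, Seattle); the private-attribute wiring for each collaborator's train/eval datasets is elided here and would be required in practice:

# Local simulation: one aggregator, two collaborators
agg = Aggregator()
collaborators = [Collaborator(name=name) for name in ("Portland", "Seattle")]
# In practice each Collaborator also needs its train_dataset/eval_dataset
# assigned as private attributes before the flow can run.

local_runtime = LocalRuntime(
    aggregator=agg, collaborators=collaborators, backend="single_process"
)

flflow = FederatedFlow(model=model, rounds=3, checkpoint=True)
flflow.runtime = local_runtime
flflow.run()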
- {
- "cell_type": "markdown",
- "id": "7bc8fe27",
- "metadata": {},
- "source": [
- "## Run Federated Learning"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Calling start\n",
- "\u001b[94mPerforming initialization for model\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling aggregated_model_validation\n",
- "\u001b[94mPerforming aggregated model validation for collaborator Portland\u001b[0m\u001b[94m\n",
- "\u001b[0m"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "Generating train split: 913 examples [00:01, 623.08 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
- "Generating train split: 104 examples [00:00, 583.62 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " [13/13 00:12]\n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mPortland value of 0.5918120741844177\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling train\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "Generating train split: 913 examples [00:01, 616.37 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
- "Generating train split: 104 examples [00:00, 615.85 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
- "\u001b[0m`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Calling local_model_validation\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " [13/13 00:13]\n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
- "\u001b[0mShould transfer from local_model_validation to join\n",
- "\n",
- "Calling aggregated_model_validation\n",
- "\u001b[94mPerforming aggregated model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
- "\u001b[0m"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.32it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "Generating train split: 907 examples [00:01, 626.09 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
- "Generating train split: 105 examples [00:00, 634.41 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " [14/14 00:13]\n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mSeattle value of 0.589488685131073\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling train\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Calling local_model_validation\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " [14/14 00:13]\n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
- "\u001b[0mShould transfer from local_model_validation to join\n",
- "\n",
- "Calling join\n",
- "\u001b[94mAverage aggregated model validation values = 0.5906503796577454\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94mAverage training loss = 0.3295206361469617\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94mAverage local model validation values = 0.3146952837705612\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling aggregated_model_validation\n",
- "\u001b[94mPerforming aggregated model validation for collaborator Portland\u001b[0m\u001b[94m\n",
- "\u001b[0m"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.33it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " [13/13 00:13]\n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mPortland value of 0.31504756212234497\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling train\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Calling local_model_validation\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " [13/13 00:13]\n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
- "\u001b[0mShould transfer from local_model_validation to join\n",
- "\n",
- "Calling aggregated_model_validation\n",
- "\u001b[94mPerforming aggregated model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
- "\u001b[0m"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- " \n",
- " \n",
- " [14/14 00:13]\n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mSeattle value of 0.31057578325271606\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling train\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
- "\n",
- "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
- " warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
- "\u001b[0m"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
" "
],
@@ -1118,7 +1169,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[94mPortland value of 0.5819987058639526\u001b[0m\u001b[94m\n",
+ "\u001b[94mPortland evaluation loss: 0.5811071991920471\u001b[0m\u001b[94m\n",
"\u001b[0m\n",
"Calling train\n"
]
@@ -1127,141 +1178,63 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
" warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
- "Generating train split: 915 examples [00:01, 619.44 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
- "Generating train split: 96 examples [00:00, 628.10 examples/s]\u001b[0m\u001b[94m0m\u001b[94m\n",
- "\u001b[0m`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n"
+ "Map: 100%|##########| 4464/4464 [00:01<00:00, 3487.12 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "Map: 100%|##########| 496/496 [00:00<00:00, 3341.29 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n",
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
]
},
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
" "
],
@@ -1379,7 +1376,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[94mSeattle value of 0.5914124846458435\u001b[0m\u001b[94m\n",
+ "\u001b[94mSeattle evaluation loss: 0.5805659294128418\u001b[0m\u001b[94m\n",
"\u001b[0m\n",
"Calling train\n"
]
@@ -1388,139 +1385,62 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
" warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "Map: 100%|##########| 4463/4463 [00:01<00:00, 3422.90 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "Map: 100%|##########| 496/496 [00:00<00:00, 3404.21 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
"\u001b[0m"
]
},
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
" "
],
@@ -1641,7 +1585,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[94mPortland value of 0.30139902234077454\u001b[0m\u001b[94m\n",
+ "\u001b[94mPortland evaluation loss: 0.38848692178726196\u001b[0m\u001b[94m\n",
"\u001b[0m\n",
"Calling train\n"
]
@@ -1650,139 +1594,62 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
" warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "Map: 100%|##########| 496/496 [00:00<00:00, 3328.62 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
"\u001b[0m"
]
},
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
" "
],
@@ -1825,36 +1696,37 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Portland] Local evaluation loss: 0.3903903663158417\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 59824.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33547.22 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57726.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"Memory Usage by Stage:\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 33255.83 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 33440.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31364.12 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31586.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 35322.87 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 59824.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33378.03 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57670.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 41948.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33547.22 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57726.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"Performance Metrics:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Training Loss: 0.2949\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Evaluation Loss: 0.2986\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.2645\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3904\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
"\u001b[0mShould transfer from local_model_validation to join\n",
"\n",
"Calling aggregated_model_validation\n",
- "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Round 0, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
"\u001b[0m"
]
},
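The per-stage memory reports in these outputs (model_load, before_training, after_training) map directly onto PyTorch's CUDA memory counters. Below is a minimal sketch of a tracker that would produce such a report; the class and its method names are invented for illustration, not the notebook's actual helper:

```python
import torch

class MemoryTracker:
    """Hypothetical per-stage CUDA memory tracker (not OpenFL or notebook API)."""

    def __init__(self):
        self.stages = {}

    def snapshot(self, stage: str):
        # Record current and peak CUDA memory, in MB, under a stage name.
        mb = 1024 ** 2
        self.stages[stage] = {
            "Allocated": torch.cuda.memory_allocated() / mb,
            "Reserved": torch.cuda.memory_reserved() / mb,
            "Max Allocated": torch.cuda.max_memory_allocated() / mb,
        }

    def report(self, name: str, mode: str):
        print(f"==== Memory Usage Report for {name} ({mode}) ====")
        print("Memory Usage by Stage:")
        for stage, stats in self.stages.items():
            print(f"  {stage}:")
            for key, value in stats.items():
                print(f"    {key}: {value:.2f} MB")

# Usage: snapshot at the same points the logs name, then print once per round.
# tracker = MemoryTracker()
# tracker.snapshot("model_load"); ...; tracker.snapshot("after_training")
# tracker.report("Portland", "4bit")
```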
@@ -1862,7 +1734,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
@@ -1873,6 +1745,23 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
"\u001b[0m"
]
},
@@ -1898,11 +1787,74 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[94mSeattle value of 0.3157660961151123\u001b[0m\u001b[94m\n",
+ "\u001b[94mSeattle evaluation loss: 0.3931271433830261\u001b[0m\u001b[94m\n",
"\u001b[0m\n",
"Calling train\n"
]
},
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "Map: 100%|##########| 496/496 [00:00<00:00, 3406.01 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2763\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2785\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2815\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2831\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2853\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2878\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2919\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2938\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2690\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2709\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2725\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2735\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2749\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2798\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2815\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2844\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2730\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2736\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2743\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2757\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2770\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2778\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2785\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2795\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2696\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2701\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2707\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2717\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2725\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2730\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2740\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2748\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.2659\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.2659\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling local_model_validation\n"
+ ]
+ },
{
"name": "stderr",
"output_type": "stream",
@@ -1917,6 +1869,10 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "Generating train split: 101 examples [00:00, 653.25 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
"\u001b[0m"
]
},
@@ -1926,104 +1882,8265 @@
"\n",
"
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.3981446623802185\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33447.23 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57866.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31364.12 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31506.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33378.03 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57810.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33447.23 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57866.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.2659\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3981\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 0, Update 1] Average aggregated model validation loss = 0.39080703258514404\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 0, Update 1] Average training loss = 0.2652348279953003\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 0, Update 1] Average local model validation loss = 0.3942675143480301\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.38319262862205505\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Completed 10 steps, current loss: 0.2195\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2213\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2243\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2270\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2291\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2310\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2341\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2371\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2230\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2253\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2263\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2280\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2298\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2311\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2324\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2346\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2331\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2338\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2351\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2361\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2378\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2385\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2398\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2408\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2375\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2379\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2386\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2398\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2404\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2413\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2421\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2428\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.2315\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.2315\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
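The new "[Portland] Restoring optimizer state" and "Starting partial epoch training with 50 steps" lines indicate that each federated round now runs a fixed number of optimizer steps and carries the optimizer state into the next round rather than restarting it. A hypothetical sketch of that loop, with every helper name invented here and only the general mechanics taken from the logs:

```python
import torch

def train_partial_epoch(model, optimizer, data_iter, steps=50, opt_state=None):
    """Run a fixed step budget and return the optimizer state for the next round."""
    if opt_state is not None:
        optimizer.load_state_dict(opt_state)  # resume where the last round stopped
    model.train()
    total = 0.0
    for step in range(1, steps + 1):
        batch = next(data_iter)
        loss = model(**batch).loss  # assumes a HF-style model returning .loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total += loss.item()
        if step % 10 == 0:
            print(f"Completed {step} steps, current loss: {total / step:.4f}")
    # Average loss for the round, plus state to restore next round.
    return total / steps, optimizer.state_dict()
```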
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.392806738615036\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58286.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31766.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.57 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58230.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58286.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.2315\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3928\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.3884606957435608\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2279\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2298\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2329\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2345\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2363\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2382\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2412\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2428\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2262\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2280\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2295\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2305\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2317\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2359\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2375\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2394\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2341\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2347\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2354\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2367\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2378\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2386\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2392\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2402\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2352\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2356\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2361\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2369\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2376\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2381\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2390\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2397\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.2306\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.2306\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.4016312062740326\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58166.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31784.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.57 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58110.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58166.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.2306\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4016\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 1, Update 0] Average aggregated model validation loss = 0.3858266621828079\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 1, Update 0] Average training loss = 0.23103156685829163\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 1, Update 0] Average local model validation loss = 0.3972189724445343\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.38549479842185974\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Completed 10 steps, current loss: 0.1714\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1729\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1755\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1777\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1793\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1809\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1836\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1854\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1804\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1824\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1834\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1847\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1863\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1876\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1888\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1907\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1972\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1979\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1991\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2000\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2014\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2021\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2032\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2043\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2048\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2052\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2057\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2068\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2073\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2081\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2088\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2095\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1995\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1995\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.401896208524704\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58226.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31748.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.58 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58170.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58226.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1995\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4019\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.3921719491481781\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1773\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1793\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1823\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1835\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1851\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1866\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1891\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1905\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1832\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1849\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1863\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1870\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1881\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1913\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1927\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1940\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1960\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1966\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1973\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1984\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1993\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2001\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2007\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2017\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1996\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2001\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2005\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2011\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2018\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2023\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2031\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2037\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1968\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1968\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.413899302482605\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57886.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31770.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.58 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57830.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57886.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1968\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4139\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 1, Update 1] Average aggregated model validation loss = 0.3888333737850189\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 1, Update 1] Average training loss = 0.19815459847450256\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 1, Update 1] Average local model validation loss = 0.4078977555036545\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.39401885867118835\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Completed 10 steps, current loss: 0.1237\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1246\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1261\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1270\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1279\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1287\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1301\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1316\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1355\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1373\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1381\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1395\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1409\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1420\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1430\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1444\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1612\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1619\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1628\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1637\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1649\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1656\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1666\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1675\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1733\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1737\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1742\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1751\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1756\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1762\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1768\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1775\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1710\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1710\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.41375958919525146\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58186.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.67 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31838.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58130.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58186.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1710\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4138\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.40143540501594543\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1366\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1386\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1411\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1418\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1435\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1452\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1468\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1478\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1464\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1478\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1488\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1494\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1505\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1524\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1537\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1551\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1653\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1658\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1664\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1673\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1681\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1689\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1695\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1704\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1743\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1748\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1751\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1757\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1764\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1768\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1775\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1780\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1720\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1720\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.42323189973831177\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58206.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.67 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31798.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58150.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58206.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1720\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4232\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 2, Update 0] Average aggregated model validation loss = 0.3977271318435669\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 2, Update 0] Average training loss = 0.17152628302574158\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 2, Update 0] Average local model validation loss = 0.4184957444667816\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.40234553813934326\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0984\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0989\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0996\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1004\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1011\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1018\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1027\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1035\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1473\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1473\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.4498145878314972\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57986.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31762.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57930.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57986.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1473\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4498\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.41048040986061096\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1032\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1041\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1049\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1058\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1065\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1075\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1085\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1092\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1116\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1130\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1140\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1148\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1160\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1172\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1186\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1195\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1353\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1358\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1363\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1370\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1377\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1384\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1390\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1397\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1460\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1463\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1467\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1472\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1478\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1482\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1489\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1493\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1441\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1441\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.44478899240493774\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58266.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31822.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58210.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58266.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1441\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4448\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 2, Update 1] Average aggregated model validation loss = 0.4064129739999771\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 2, Update 1] Average training loss = 0.14573591947555542\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 2, Update 1] Average local model validation loss = 0.44730179011821747\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.4368877112865448\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0855\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0863\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0871\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0882\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0889\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0895\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0902\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0911\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0842\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0857\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0862\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0868\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0874\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0882\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0887\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0894\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1078\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1084\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1092\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1099\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1108\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1113\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1121\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1129\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1229\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1232\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1235\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1241\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1245\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1249\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1254\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1259\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1210\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1210\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.4723173975944519\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58146.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31782.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.60 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58090.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58146.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1210\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4723\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.44539061188697815\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0859\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0867\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0880\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0891\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0899\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0907\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0927\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0936\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0927\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0941\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0948\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0954\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0963\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0970\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0981\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0987\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1174\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1178\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1183\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1188\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1197\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1203\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1210\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1218\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1301\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1304\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1308\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1312\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1317\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1321\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1326\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1330\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1277\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1277\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.4785761833190918\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58106.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31914.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.60 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58050.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58106.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1277\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4786\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 3, Update 0] Average aggregated model validation loss = 0.4411391615867615\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 3, Update 0] Average training loss = 0.12434957921504974\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 3, Update 0] Average local model validation loss = 0.47544679045677185\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.47256991267204285\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0721\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0727\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0735\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0744\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0752\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0759\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0768\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0777\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0733\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0750\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0756\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0761\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0766\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0771\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0780\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0786\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1022\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1022\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.5270882248878479\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.79 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58346.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31804.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.61 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58290.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.79 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58346.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1022\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5271\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.48190006613731384\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0685\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0691\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0700\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0707\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0716\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0723\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0734\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0744\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0731\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0741\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0750\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0754\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0761\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0766\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0772\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0777\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0940\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0945\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0949\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0954\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0960\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0966\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0972\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0978\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1058\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1061\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1064\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1068\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1073\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1076\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1082\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1085\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1042\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1042\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.528550386428833\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.80 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58384.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31838.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.61 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58328.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.80 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58384.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.1042\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5286\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 3, Update 1] Average aggregated model validation loss = 0.47723498940467834\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 3, Update 1] Average training loss = 0.10317578911781311\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 3, Update 1] Average local model validation loss = 0.5278193056583405\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Completed 30 steps, current loss: 0.0727\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0730\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0734\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0739\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0744\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0748\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0754\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0759\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0824\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0826\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0828\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0831\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0832\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0835\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0838\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0841\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.0805\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.0805\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.5606555938720703\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.80 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58806.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.70 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31916.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58748.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.80 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58806.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.0805\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5607\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.5261728763580322\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0576\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0584\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0589\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0594\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0602\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0610\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0617\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0622\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0613\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0622\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0627\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0631\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0637\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0642\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0649\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0653\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0763\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0766\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0770\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0773\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0778\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0783\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0788\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0791\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0871\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0873\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0875\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0878\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0882\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0885\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0891\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0893\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.0857\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.0857\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.5728362202644348\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58444.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.70 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 32006.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58388.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33606.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 58444.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.0857\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5728\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 4, Update 0] Average aggregated model validation loss = 0.5190570056438446\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 4, Update 0] Average training loss = 0.08311298489570618\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 4, Update 0] Average local model validation loss = 0.5667459070682526\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.26it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.5445127487182617\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0533\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0536\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0543\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0551\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0557\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0563\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0570\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0577\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0569\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0578\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0583\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0587\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0590\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0595\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0600\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0604\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0669\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0672\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0675\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0677\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0682\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0684\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0688\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0694\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0748\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0750\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0752\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0755\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0757\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0759\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0763\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0766\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.0734\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.0734\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:13]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.5720435380935669\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57946.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 31523.71 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 31786.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57888.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 33706.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 57946.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: 0.0734\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5720\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.562188982963562\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
+ " return fn(*args, **kwargs)\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0550\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0556\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0563\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0569\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0575\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0582\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0598\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0602\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0583\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0590\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0594\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0597\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0605\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0610\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0614\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0619\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0678\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0680\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0683\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0686\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0689\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0692\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0696\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0699\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0732\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0734\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0736\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0738\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0741\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0743\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0746\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0748\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.0720\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.0720\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.561655580997467\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.3953164219856262\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39210.98 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 59636.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 36814.91 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38196.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39210.98 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 59636.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39008.34 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42136.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3953\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.26it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.5638197064399719\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.40347251296043396\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39317.46 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 59596.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 36974.44 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38276.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39317.46 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 59596.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39149.93 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42176.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4035\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 0, Update 0] Average aggregated model validation loss = 0.5627376437187195\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 0, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 0, Update 0] Average local model validation loss = 0.3993944674730301\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 0, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.395594984292984\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.39317232370376587\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40105.33 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60902.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37772.41 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38798.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57164.72 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40105.33 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60902.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39841.12 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42898.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.3932\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 0, Update 1] Average aggregated model validation loss = 0.3971341550350189\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 0, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 0, Update 1] Average local model validation loss = 0.39462728798389435\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.3897101879119873\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.40585511922836304\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40210.68 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61118.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38822.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57214.60 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40210.68 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61118.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39916.32 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42894.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4059\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.3935319185256958\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.40515169501304626\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40215.71 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60930.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38916.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40215.71 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60930.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39949.17 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42986.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4052\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 1, Update 0] Average aggregated model validation loss = 0.39162105321884155\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 1, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 1, Update 0] Average local model validation loss = 0.40550340712070465\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.3948669135570526\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.4210392236709595\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40210.16 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60316.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38794.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40210.16 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60316.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39911.82 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42890.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4210\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.39981263875961304\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.4217308461666107\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40218.59 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61308.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38902.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40218.59 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61308.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39946.36 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42924.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4217\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 1, Update 1] Average aggregated model validation loss = 0.3973397761583328\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 1, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 1, Update 1] Average local model validation loss = 0.4213850349187851\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.40616321563720703\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.5076923370361328\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40203.53 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61136.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38848.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40203.53 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61136.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39909.20 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42830.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5077\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.41194236278533936\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.4701308310031891\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40210.40 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61008.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38938.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40210.40 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61008.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39941.88 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 43064.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.4701\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 2, Update 0] Average aggregated model validation loss = 0.4090527892112732\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 2, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 2, Update 0] Average local model validation loss = 0.48891158401966095\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.4745386242866516\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.5099506974220276\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40192.70 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61154.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38856.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40192.70 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61154.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39908.28 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42890.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5100\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.48276180028915405\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.5275096893310547\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40199.68 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60928.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38960.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40199.68 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60928.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39946.38 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42922.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5275\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 2, Update 1] Average aggregated model validation loss = 0.47865021228790283\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 2, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 2, Update 1] Average local model validation loss = 0.5187301933765411\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.5035024285316467\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.5614020824432373\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40187.96 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60596.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38822.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40187.96 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60596.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39911.37 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42870.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5614\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.5139071941375732\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Local evaluation loss: 0.5329421162605286\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40194.72 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61068.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38914.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40194.72 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61068.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39942.97 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42944.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5329\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling join\n",
+ "\u001b[94m[Round 3, Update 0] Average aggregated model validation loss = 0.50870481133461\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 3, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 3, Update 0] Average local model validation loss = 0.5471720993518829\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.5240783095359802\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:12]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Local evaluation loss: 0.5784252882003784\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40185.57 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Memory Usage by Stage:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38820.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40185.57 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39910.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42950.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m\n",
+ "Performance Metrics:\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5784\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
+ "\u001b[0mShould transfer from local_model_validation to join\n",
+ "\n",
+ "Calling aggregated_model_validation\n",
+ "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " [13/13 00:11]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.5348654389381409\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
"
@@ -2035,12 +10152,35 @@
"metadata": {},
"output_type": "display_data"
},
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
- "Calling local_model_validation\n"
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
]
},
{
@@ -2057,6 +10197,27 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m"
]
},
@@ -2067,7 +10228,7 @@
"
\n",
" "
],
@@ -2260,30 +10339,37 @@
"metadata": {},
"output_type": "display_data"
},
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mPortland value of 0.5662918090820312\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling train\n"
- ]
- },
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
" warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.5471141934394836\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
@@ -2296,8 +10382,8 @@
"\n",
"
"
@@ -2417,6 +10463,14 @@
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m"
]
},
@@ -2424,7 +10478,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
"Calling local_model_validation\n"
]
},
@@ -2442,6 +10497,9 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
@@ -2455,8 +10513,8 @@
"\n",
"
\n",
" "
],
@@ -2471,36 +10529,37 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Portland] Local evaluation loss: 0.5927156209945679\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 75278.13 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 93310.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40181.67 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60874.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"Memory Usage by Stage:\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 72811.06 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 91120.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 91914.16 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38838.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 75278.13 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 93310.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40181.67 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60874.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 75250.57 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 83158.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39912.48 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42850.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"Performance Metrics:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Training Loss: 0.3243\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Evaluation Loss: 0.2989\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5927\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
"\u001b[0mShould transfer from local_model_validation to join\n",
"\n",
"Calling aggregated_model_validation\n",
- "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
"\u001b[0m"
]
},
@@ -2508,7 +10567,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
@@ -2519,6 +10578,23 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
@@ -2544,30 +10620,37 @@
"metadata": {},
"output_type": "display_data"
},
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mSeattle value of 0.5757399201393127\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling train\n"
- ]
- },
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
" warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.5571405291557312\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
@@ -2580,8 +10663,8 @@
"\n",
"
"
@@ -2701,6 +10744,14 @@
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m"
]
},
@@ -2708,8 +10759,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
- "Calling local_model_validation\n"
+ "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
]
},
{
@@ -2726,6 +10777,23 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
@@ -2755,41 +10823,42 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Seattle] Local evaluation loss: 0.5642604827880859\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 58276.76 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 83620.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40189.73 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61506.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"Memory Usage by Stage:\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 55775.01 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 74152.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38928.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 58145.08 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 83620.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40189.73 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 61506.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 58276.76 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 65726.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39944.91 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 43022.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"Performance Metrics:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Training Loss: 0.3242\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Evaluation Loss: 0.3134\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5643\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
"\u001b[0mShould transfer from local_model_validation to join\n",
"\n",
"Calling join\n",
- "\u001b[94mAverage aggregated model validation values = 0.571015864610672\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94mAverage training loss = 0.3242545044578319\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94mAverage local model validation values = 0.30610978603363037\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Round 4, Update 0] Average aggregated model validation loss = 0.5521273612976074\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 4, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m[Round 4, Update 0] Average local model validation loss = 0.5784880518913269\u001b[0m\u001b[94m\n",
"\u001b[0m\n",
"Calling aggregated_model_validation\n",
- "\u001b[94mPerforming aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n",
"\u001b[0m"
]
},
@@ -2797,7 +10866,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
@@ -2808,6 +10877,23 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
@@ -2821,8 +10907,8 @@
"\n",
"
\n",
" "
],
@@ -2833,30 +10919,37 @@
"metadata": {},
"output_type": "display_data"
},
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mPortland value of 0.296934574842453\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling train\n"
- ]
- },
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
" warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mPortland evaluation loss: 0.5503019094467163\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
@@ -2869,8 +10962,8 @@
"\n",
"
"
@@ -2990,6 +11043,14 @@
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m"
]
},
@@ -2997,8 +11058,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
- "Calling local_model_validation\n"
+ "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
]
},
{
@@ -3015,6 +11076,23 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Calling local_model_validation\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
@@ -3028,8 +11106,8 @@
"\n",
"
\n",
" "
],
@@ -3044,36 +11122,37 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Portland] Local evaluation loss: 0.5799501538276672\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 75488.79 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 93552.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40182.25 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"Memory Usage by Stage:\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 73023.56 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 74272.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 38820.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 75488.79 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 93552.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 40182.25 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Allocated: 75466.38 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Reserved: 82944.00 MB\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Allocated: 39911.18 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Reserved: 42930.00 MB\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m\n",
"Performance Metrics:\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Training Loss: 0.2914\u001b[0m\u001b[94m\n",
- "\u001b[0m\u001b[94m Evaluation Loss: 0.2939\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n",
+ "\u001b[0m\u001b[94m Evaluation Loss: 0.5800\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n",
"\u001b[0mShould transfer from local_model_validation to join\n",
"\n",
"Calling aggregated_model_validation\n",
- "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
+ "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n",
"\u001b[0m"
]
},
@@ -3081,7 +11160,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
+ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
@@ -3092,6 +11171,23 @@
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+ " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
@@ -3117,140 +11213,107 @@
"metadata": {},
"output_type": "display_data"
},
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[94mSeattle value of 0.31116044521331787\u001b[0m\u001b[94m\n",
- "\u001b[0m\n",
- "Calling train\n"
- ]
- },
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n",
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n",
"\n",
"Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
" warnings.warn(message, FutureWarning)\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
- " warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
" warnings.warn(\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
- " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
- "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
- " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94mSeattle evaluation loss: 0.5621325373649597\u001b[0m\u001b[94m\n",
+ "\u001b[0m\n",
+ "Calling train\n",
+ "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n",
"\u001b[0m"
]
},
{
- "data": {
- "text/html": [
- "\n",
- "
\n",
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+ "\u001b[0m"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "