From b8abdef826ab4b57f3d9ad59d5b05378f3397ccb Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Thu, 24 Apr 2025 17:19:59 +0530 Subject: [PATCH 01/34] Update trivy.yml --- .github/workflows/trivy.yml | 170 +++++++++++++++++++++++++----------- 1 file changed, 119 insertions(+), 51 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 62b196f3d1..67cf6d1fcb 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -1,4 +1,5 @@ name: Trivy Nightly Scan + on: workflow_call: inputs: @@ -7,15 +8,15 @@ on: type: string workflow_dispatch: schedule: - - cron: '0 0 * * *' # This runs the workflow every night at midnight UTC + - cron: '0 0 * * *' # Every night at midnight UTC jobs: build: if: github.event.pull_request.draft == false permissions: - contents: read # for actions/checkout to fetch code - security-events: write # for github/codeql-action/upload-sarif to upload SARIF results - actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + contents: read + security-events: write + actions: read runs-on: ubuntu-22.04 timeout-minutes: 15 @@ -33,34 +34,29 @@ jobs: run: | curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin - - name: Run Trivy code vulnerability scanner (JSON Output) - run: | - trivy --quiet fs \ - --format json \ - --output trivy-code-results.json \ - --ignore-unfixed \ - --vuln-type os,library \ - --severity CRITICAL,HIGH,MEDIUM \ - . 
- - - name: Display Trivy code Scan Results - if: failure() # Ensure this step runs regardless of the previous step's outcome + - name: Run Trivy code vulnerability scanner (JSON) + continue-on-error: true run: | - echo "Trivy Scan Results:" - cat trivy-code-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}' + trivy fs \ + --quiet \ + --format json \ + --output trivy-code-results.json \ + --ignore-unfixed \ + --vuln-type os,library \ + --severity CRITICAL,HIGH,MEDIUM . - - name: Upload Code Vulnerability Scan Results + - name: Upload Trivy Code JSON uses: actions/upload-artifact@v4 with: - name: trivy-code-report-json + name: trivy-code-report-${{ env.COMMIT_ID }} path: trivy-code-results.json - - name: Build an image from Dockerfile + - name: Build Docker Image run: | docker build --pull -t docker.io/securefederatedai/openfl:${{ github.sha }} -f openfl-docker/Dockerfile.base . - - name: Run Trivy vulnerability scanner for Docker image (JSON Output) - id: trivy-scan + - name: Trivy scan for Docker image (JSON) + id: trivy-image-json uses: aquasecurity/trivy-action@0.30.0 with: image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}' @@ -71,40 +67,32 @@ jobs: vuln-type: 'os,library' severity: 'CRITICAL,HIGH,MEDIUM' trivyignores: '.trivyignore' + continue-on-error: true - - name: Display Trivy Docker Scan Results - if: failure() # Ensure this step runs regardless of the previous step's outcome - run: | - if [ -s trivy-docker-results.json ]; then - echo "Trivy Scan Results:" - cat trivy-docker-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}' - else - echo "Trivy scan results file is empty or not found." 
- fi - - - name: Upload final Trivy Docker Vulnerability Scan + - name: Upload Trivy Docker JSON uses: actions/upload-artifact@v4 with: - name: trivy-docker-report-json + name: trivy-docker-report-${{ env.COMMIT_ID }} path: trivy-docker-results.json - - name: Run Trivy code vulnerability scanner (SPDX-JSON Output) + - name: Trivy scan for code (SPDX) + continue-on-error: true run: | - trivy --quiet fs \ - --format spdx-json \ - --output trivy-code-spdx-results.json \ - --ignore-unfixed \ - --vuln-type os,library \ - --severity CRITICAL,HIGH,MEDIUM \ - . - - - name: Upload SPDX Code Vulnerability Scan Results + trivy fs \ + --quiet \ + --format spdx-json \ + --output trivy-code-spdx-results.json \ + --ignore-unfixed \ + --vuln-type os,library \ + --severity CRITICAL,HIGH,MEDIUM . + + - name: Upload Trivy SPDX Code uses: actions/upload-artifact@v4 with: - name: trivy-code-spdx-report-json + name: trivy-code-spdx-${{ env.COMMIT_ID }} path: trivy-code-spdx-results.json - - - name: Run Trivy vulnerability scanner for Docker image (SPDX-JSON Output) + + - name: Trivy scan for Docker image (SPDX) uses: aquasecurity/trivy-action@0.30.0 with: image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}' @@ -115,9 +103,89 @@ jobs: vuln-type: 'os,library' severity: 'CRITICAL,HIGH,MEDIUM' trivyignores: '.trivyignore' - - - name: Upload SPDX Docker Vulnerability Scan + continue-on-error: true + + - name: Upload Trivy SPDX Docker uses: actions/upload-artifact@v4 with: - name: trivy-docker-spdx-report-json + name: trivy-docker-spdx-${{ env.COMMIT_ID }} path: trivy-docker-spdx-results.json + + - name: Generate Vulnerability Summary + id: summary + if: always() + run: | + echo "## Trivy Security Scan Summary" > vulnerability-summary.md + echo "Scan Date: $(date)" >> vulnerability-summary.md + echo "Repository: ${{ github.repository }}" >> vulnerability-summary.md + echo "Commit: ${{ env.COMMIT_ID }}" >> vulnerability-summary.md + echo "" >> vulnerability-summary.md + + 
HAS_VULNS=false + + if [ -s trivy-code-results.json ]; then + COUNT=$(jq '[.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[]] | length' trivy-code-results.json) + if [ "$COUNT" -gt 0 ]; then + HAS_VULNS=true + echo "### Code Vulnerabilities Detected: $COUNT" >> vulnerability-summary.md + echo "| Severity | ID | Package | Version | Description |" >> vulnerability-summary.md + echo "|----------|----|---------|---------|-------------|" >> vulnerability-summary.md + jq -r '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | + "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-code-results.json >> vulnerability-summary.md + fi + fi + + if [ -s trivy-docker-results.json ]; then + COUNT=$(jq '[.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[]] | length' trivy-docker-results.json) + if [ "$COUNT" -gt 0 ]; then + HAS_VULNS=true + echo "### Docker Image Vulnerabilities Detected: $COUNT" >> vulnerability-summary.md + echo "| Severity | ID | Package | Version | Description |" >> vulnerability-summary.md + echo "|----------|----|---------|---------|-------------|" >> vulnerability-summary.md + jq -r '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | + "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-docker-results.json >> vulnerability-summary.md + fi + fi + + echo "has_vulnerabilities=$HAS_VULNS" >> $GITHUB_OUTPUT + cat vulnerability-summary.md + + - name: Upload Summary Markdown + uses: actions/upload-artifact@v4 + if: always() + with: + name: vulnerability-summary-${{ env.COMMIT_ID }} + path: vulnerability-summary.md + + - name: Extract Code Owners + id: codeowners + if: always() + run: | + if [ -f "CODEOWNERS" ]; then + OWNERS=$(grep -v "^#" CODEOWNERS | grep -o '@[^ ]*' | sort -u | tr '\n' ',' | sed 's/,$//') + echo "owners=$OWNERS" >> $GITHUB_OUTPUT + else + echo "owners=" >> $GITHUB_OUTPUT + fi + + - name: 
Send Email Notification + if: always() && (steps.summary.outputs.has_vulnerabilities == 'true' || failure()) + uses: dawidd6/action-send-mail@v3 + with: + server_address: ${{ secrets.SMTP_SERVER }} + server_port: ${{ secrets.SMTP_PORT }} + username: ${{ secrets.SMTP_USERNAME }} + password: ${{ secrets.SMTP_PASSWORD }} + subject: > + ${{ + (failure() && '🚨 Scan Failed: ') || + (steps.summary.outputs.has_vulnerabilities == 'true' && '⚠️ Vulnerabilities Found: ') || + '✅ Scan Completed: ' + }}Security Scan for ${{ github.repository }} + body: | +

View workflow run | commit

+ ${{ fileContents('vulnerability-summary.md') }} +

Download full reports from workflow artifacts.

+ to: ${{ steps.codeowners.outputs.owners || secrets.SECURITY_EMAIL_RECIPIENTS }} + from: GitHub Actions + content_type: text/html From 0ed37a7f81fc28eb3db50f34dbe0da885f34c267 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Thu, 24 Apr 2025 17:23:39 +0530 Subject: [PATCH 02/34] Update trivy.yml --- .github/workflows/trivy.yml | 143 ++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 63 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 67cf6d1fcb..f4c38a6b02 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -34,29 +34,34 @@ jobs: run: | curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin - - name: Run Trivy code vulnerability scanner (JSON) - continue-on-error: true + - name: Run Trivy code vulnerability scanner (JSON Output) run: | - trivy fs \ - --quiet \ - --format json \ - --output trivy-code-results.json \ - --ignore-unfixed \ - --vuln-type os,library \ - --severity CRITICAL,HIGH,MEDIUM . - - - name: Upload Trivy Code JSON + trivy --quiet fs \ + --format json \ + --output trivy-code-results.json \ + --ignore-unfixed \ + --vuln-type os,library \ + --severity CRITICAL,HIGH,MEDIUM \ + . + + - name: Display Trivy code Scan Results + if: failure() + run: | + echo "Trivy Scan Results:" + cat trivy-code-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}' + + - name: Upload Code Vulnerability Scan Results uses: actions/upload-artifact@v4 with: - name: trivy-code-report-${{ env.COMMIT_ID }} + name: trivy-code-report-json path: trivy-code-results.json - - name: Build Docker Image + - name: Build an image from Dockerfile run: | docker build --pull -t docker.io/securefederatedai/openfl:${{ github.sha }} -f openfl-docker/Dockerfile.base . 
- - name: Trivy scan for Docker image (JSON) - id: trivy-image-json + - name: Run Trivy vulnerability scanner for Docker image (JSON Output) + id: trivy-scan uses: aquasecurity/trivy-action@0.30.0 with: image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}' @@ -67,32 +72,40 @@ jobs: vuln-type: 'os,library' severity: 'CRITICAL,HIGH,MEDIUM' trivyignores: '.trivyignore' - continue-on-error: true - - name: Upload Trivy Docker JSON + - name: Display Trivy Docker Scan Results + if: failure() + run: | + if [ -s trivy-docker-results.json ]; then + echo "Trivy Scan Results:" + cat trivy-docker-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}' + else + echo "Trivy scan results file is empty or not found." + fi + + - name: Upload final Trivy Docker Vulnerability Scan uses: actions/upload-artifact@v4 with: - name: trivy-docker-report-${{ env.COMMIT_ID }} + name: trivy-docker-report-json path: trivy-docker-results.json - - name: Trivy scan for code (SPDX) - continue-on-error: true + - name: Run Trivy code vulnerability scanner (SPDX-JSON Output) run: | - trivy fs \ - --quiet \ - --format spdx-json \ - --output trivy-code-spdx-results.json \ - --ignore-unfixed \ - --vuln-type os,library \ - --severity CRITICAL,HIGH,MEDIUM . - - - name: Upload Trivy SPDX Code + trivy --quiet fs \ + --format spdx-json \ + --output trivy-code-spdx-results.json \ + --ignore-unfixed \ + --vuln-type os,library \ + --severity CRITICAL,HIGH,MEDIUM \ + . 
+ + - name: Upload SPDX Code Vulnerability Scan Results uses: actions/upload-artifact@v4 with: - name: trivy-code-spdx-${{ env.COMMIT_ID }} + name: trivy-code-spdx-report-json path: trivy-code-spdx-results.json - - name: Trivy scan for Docker image (SPDX) + - name: Run Trivy vulnerability scanner for Docker image (SPDX-JSON Output) uses: aquasecurity/trivy-action@0.30.0 with: image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}' @@ -103,12 +116,11 @@ jobs: vuln-type: 'os,library' severity: 'CRITICAL,HIGH,MEDIUM' trivyignores: '.trivyignore' - continue-on-error: true - - name: Upload Trivy SPDX Docker + - name: Upload SPDX Docker Vulnerability Scan uses: actions/upload-artifact@v4 with: - name: trivy-docker-spdx-${{ env.COMMIT_ID }} + name: trivy-docker-spdx-report-json path: trivy-docker-spdx-results.json - name: Generate Vulnerability Summary @@ -120,41 +132,38 @@ jobs: echo "Repository: ${{ github.repository }}" >> vulnerability-summary.md echo "Commit: ${{ env.COMMIT_ID }}" >> vulnerability-summary.md echo "" >> vulnerability-summary.md - - HAS_VULNS=false - + if [ -s trivy-code-results.json ]; then - COUNT=$(jq '[.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[]] | length' trivy-code-results.json) - if [ "$COUNT" -gt 0 ]; then - HAS_VULNS=true - echo "### Code Vulnerabilities Detected: $COUNT" >> vulnerability-summary.md - echo "| Severity | ID | Package | Version | Description |" >> vulnerability-summary.md - echo "|----------|----|---------|---------|-------------|" >> vulnerability-summary.md - jq -r '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | - "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-code-results.json >> vulnerability-summary.md + VULN_COUNT=$(jq '[.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[]] | length' trivy-code-results.json) + if [ "$VULN_COUNT" -gt 0 ]; then + echo "### Code Vulnerabilities Detected: $VULN_COUNT" >> 
vulnerability-summary.md + echo "" >> vulnerability-summary.md + echo "| Severity | Vulnerability ID | Package | Version | Description |" >> vulnerability-summary.md + echo "|----------|------------------|---------|---------|-------------|" >> vulnerability-summary.md + jq -r '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-code-results.json >> vulnerability-summary.md + echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT fi fi if [ -s trivy-docker-results.json ]; then - COUNT=$(jq '[.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[]] | length' trivy-docker-results.json) - if [ "$COUNT" -gt 0 ]; then - HAS_VULNS=true - echo "### Docker Image Vulnerabilities Detected: $COUNT" >> vulnerability-summary.md - echo "| Severity | ID | Package | Version | Description |" >> vulnerability-summary.md - echo "|----------|----|---------|---------|-------------|" >> vulnerability-summary.md - jq -r '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | - "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-docker-results.json >> vulnerability-summary.md + VULN_COUNT=$(jq '[.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[]] | length' trivy-docker-results.json) + if [ "$VULN_COUNT" -gt 0 ]; then + echo "### Docker Image Vulnerabilities Detected: $VULN_COUNT" >> vulnerability-summary.md + echo "" >> vulnerability-summary.md + echo "| Severity | Vulnerability ID | Package | Version | Description |" >> vulnerability-summary.md + echo "|----------|------------------|---------|---------|-------------|" >> vulnerability-summary.md + jq -r '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-docker-results.json >> vulnerability-summary.md + echo "has_vulnerabilities=true" 
>> $GITHUB_OUTPUT fi fi - echo "has_vulnerabilities=$HAS_VULNS" >> $GITHUB_OUTPUT cat vulnerability-summary.md - - name: Upload Summary Markdown + - name: Upload Vulnerability Summary uses: actions/upload-artifact@v4 if: always() with: - name: vulnerability-summary-${{ env.COMMIT_ID }} + name: vulnerability-summary path: vulnerability-summary.md - name: Extract Code Owners @@ -164,12 +173,24 @@ jobs: if [ -f "CODEOWNERS" ]; then OWNERS=$(grep -v "^#" CODEOWNERS | grep -o '@[^ ]*' | sort -u | tr '\n' ',' | sed 's/,$//') echo "owners=$OWNERS" >> $GITHUB_OUTPUT + echo "Found code owners: $OWNERS" else + echo "No CODEOWNERS file found, using default recipients" echo "owners=" >> $GITHUB_OUTPUT fi + - name: Read vulnerability summary into output + id: read_summary + if: always() + run: | + { + echo "summary<> $GITHUB_OUTPUT + - name: Send Email Notification - if: always() && (steps.summary.outputs.has_vulnerabilities == 'true' || failure()) + if: always() uses: dawidd6/action-send-mail@v3 with: server_address: ${{ secrets.SMTP_SERVER }} @@ -177,14 +198,10 @@ jobs: username: ${{ secrets.SMTP_USERNAME }} password: ${{ secrets.SMTP_PASSWORD }} subject: > - ${{ - (failure() && '🚨 Scan Failed: ') || - (steps.summary.outputs.has_vulnerabilities == 'true' && '⚠️ Vulnerabilities Found: ') || - '✅ Scan Completed: ' - }}Security Scan for ${{ github.repository }} + ${{ steps.summary.outputs.has_vulnerabilities == 'true' && '⚠️ Vulnerabilities Found:' || '✅ Scan Completed:' }} Security Scan for ${{ github.repository }} body: |

View workflow run | commit

- ${{ fileContents('vulnerability-summary.md') }} + ${{ steps.read_summary.outputs.summary }}

Download full reports from workflow artifacts.

to: ${{ steps.codeowners.outputs.owners || secrets.SECURITY_EMAIL_RECIPIENTS }} from: GitHub Actions From 2d37918241f60c307da5cb054a1c200cd5a525a7 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Thu, 24 Apr 2025 17:50:41 +0530 Subject: [PATCH 03/34] Update trivy.yml --- .github/workflows/trivy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index f4c38a6b02..7d6b1032a0 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -199,10 +199,10 @@ jobs: password: ${{ secrets.SMTP_PASSWORD }} subject: > ${{ steps.summary.outputs.has_vulnerabilities == 'true' && '⚠️ Vulnerabilities Found:' || '✅ Scan Completed:' }} Security Scan for ${{ github.repository }} - body: | + html_body: |

View workflow run | commit

${{ steps.read_summary.outputs.summary }}

Download full reports from workflow artifacts.

to: ${{ steps.codeowners.outputs.owners || secrets.SECURITY_EMAIL_RECIPIENTS }} from: GitHub Actions - content_type: text/html + From 19d040bf95f456a57d975d4d1aae34e956bcb1c8 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:23:04 +0530 Subject: [PATCH 04/34] Create extract_emails.py --- .github/scripts/extract_emails.py | 45 +++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/scripts/extract_emails.py diff --git a/.github/scripts/extract_emails.py b/.github/scripts/extract_emails.py new file mode 100644 index 0000000000..70f17b64a6 --- /dev/null +++ b/.github/scripts/extract_emails.py @@ -0,0 +1,45 @@ +import re +import os +import sys +import json + +def extract_emails(filepath): + """ + Extract all unique email addresses from the given file. + """ + email_pattern = r'[\w.+-]+@[\w-]+\.[\w.-]+' + unique_emails = set() + + try: + with open(filepath, 'r') as file: + for line in file: + # Skip comment lines that don't contain emails + if line.strip().startswith('#') and '@' not in line: + continue + + # Find all email addresses in the line + emails = re.findall(email_pattern, line) + unique_emails.update(emails) + except Exception as e: + print(f"Error processing {filepath}: {str(e)}", file=sys.stderr) + + return sorted(unique_emails) + +if __name__ == "__main__": + # Check CODEOWNERS in standard locations + codeowners_path = None + for path in ['.github/CODEOWNERS', 'CODEOWNERS', 'docs/CODEOWNERS']: + if os.path.exists(path): + codeowners_path = path + break + + result = { + "emails": [], + "codeowners_path": codeowners_path + } + + if codeowners_path: + emails = extract_emails(codeowners_path) + result["emails"] = emails + + print(json.dumps(result)) From 025d67d0c0ac67d889b64ca11e763f73a13d0b78 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:25:25 +0530 Subject: 
[PATCH 05/34] Update trivy.yml --- .github/workflows/trivy.yml | 273 ++++++++++++++++-------------------- 1 file changed, 121 insertions(+), 152 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 7d6b1032a0..8e7c6c60ac 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -1,5 +1,4 @@ -name: Trivy Nightly Scan - +name: Trivy Nightly Security Scan on: workflow_call: inputs: @@ -8,201 +7,171 @@ on: type: string workflow_dispatch: schedule: - - cron: '0 0 * * *' # Every night at midnight UTC + - cron: '0 0 * * *' # Runs daily at midnight UTC jobs: - build: + security-scan: if: github.event.pull_request.draft == false permissions: contents: read security-events: write actions: read + packages: read + issues: write # Needed for creating issues if vulnerabilities found runs-on: ubuntu-22.04 - timeout-minutes: 15 - + timeout-minutes: 45 env: - TRIVY_DB_REPOSITORY: 'ghcr.io/aquasecurity/trivy-db,public.ecr.aws/aquasecurity/trivy-db' + TRIVY_VERSION: 0.50.1 COMMIT_ID: ${{ inputs.commit_id || github.sha }} steps: - - name: Checkout code + # ============ SETUP PHASE ============ + - name: Checkout repository uses: actions/checkout@v4 with: ref: ${{ env.COMMIT_ID }} + fetch-depth: 0 - - name: Install Trivy - run: | - curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin - - - name: Run Trivy code vulnerability scanner (JSON Output) - run: | - trivy --quiet fs \ - --format json \ - --output trivy-code-results.json \ - --ignore-unfixed \ - --vuln-type os,library \ - --severity CRITICAL,HIGH,MEDIUM \ - . 
- - - name: Display Trivy code Scan Results - if: failure() - run: | - echo "Trivy Scan Results:" - cat trivy-code-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}' - - - name: Upload Code Vulnerability Scan Results - uses: actions/upload-artifact@v4 - with: - name: trivy-code-report-json - path: trivy-code-results.json - - - name: Build an image from Dockerfile - run: | - docker build --pull -t docker.io/securefederatedai/openfl:${{ github.sha }} -f openfl-docker/Dockerfile.base . - - - name: Run Trivy vulnerability scanner for Docker image (JSON Output) - id: trivy-scan + # ============ SCANNING PHASE ============ + - name: Run security scans uses: aquasecurity/trivy-action@0.30.0 with: - image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}' + scan-type: 'fs' format: 'json' - output: 'trivy-docker-results.json' - exit-code: '1' + output: 'trivy-fs-report.json' + severity: 'CRITICAL,HIGH' ignore-unfixed: true vuln-type: 'os,library' - severity: 'CRITICAL,HIGH,MEDIUM' - trivyignores: '.trivyignore' - - - name: Display Trivy Docker Scan Results - if: failure() - run: | - if [ -s trivy-docker-results.json ]; then - echo "Trivy Scan Results:" - cat trivy-docker-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}' - else - echo "Trivy scan results file is empty or not found." - fi - - - name: Upload final Trivy Docker Vulnerability Scan - uses: actions/upload-artifact@v4 - with: - name: trivy-docker-report-json - path: trivy-docker-results.json + security-checks: 'vuln' - - name: Run Trivy code vulnerability scanner (SPDX-JSON Output) + - name: Build Docker image run: | - trivy --quiet fs \ - --format spdx-json \ - --output trivy-code-spdx-results.json \ - --ignore-unfixed \ - --vuln-type os,library \ - --severity CRITICAL,HIGH,MEDIUM \ - . 
- - - name: Upload SPDX Code Vulnerability Scan Results - uses: actions/upload-artifact@v4 - with: - name: trivy-code-spdx-report-json - path: trivy-code-spdx-results.json - - - name: Run Trivy vulnerability scanner for Docker image (SPDX-JSON Output) + docker buildx build \ + --pull \ + --tag local/scan-target:${{ github.run_id }} \ + --file openfl-docker/Dockerfile.base \ + --load \ + . + + - name: Scan Docker image uses: aquasecurity/trivy-action@0.30.0 with: - image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}' - format: 'spdx-json' - output: 'trivy-docker-spdx-results.json' - exit-code: '1' + image-ref: 'local/scan-target:${{ github.run_id }}' + format: 'json' + output: 'trivy-image-report.json' + severity: 'CRITICAL,HIGH' ignore-unfixed: true vuln-type: 'os,library' - severity: 'CRITICAL,HIGH,MEDIUM' - trivyignores: '.trivyignore' + security-checks: 'vuln' - - name: Upload SPDX Docker Vulnerability Scan - uses: actions/upload-artifact@v4 - with: - name: trivy-docker-spdx-report-json - path: trivy-docker-spdx-results.json + # ============ REPORTING PHASE ============ + - name: Generate SBOM reports + run: | + trivy fs --format spdx-json --output trivy-fs-sbom.json . 
+ trivy image --format spdx-json --output trivy-image-sbom.json local/scan-target:${{ github.run_id }} - - name: Generate Vulnerability Summary - id: summary - if: always() + - name: Create consolidated report + id: report run: | - echo "## Trivy Security Scan Summary" > vulnerability-summary.md - echo "Scan Date: $(date)" >> vulnerability-summary.md - echo "Repository: ${{ github.repository }}" >> vulnerability-summary.md - echo "Commit: ${{ env.COMMIT_ID }}" >> vulnerability-summary.md - echo "" >> vulnerability-summary.md - - if [ -s trivy-code-results.json ]; then - VULN_COUNT=$(jq '[.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[]] | length' trivy-code-results.json) - if [ "$VULN_COUNT" -gt 0 ]; then - echo "### Code Vulnerabilities Detected: $VULN_COUNT" >> vulnerability-summary.md - echo "" >> vulnerability-summary.md - echo "| Severity | Vulnerability ID | Package | Version | Description |" >> vulnerability-summary.md - echo "|----------|------------------|---------|---------|-------------|" >> vulnerability-summary.md - jq -r '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-code-results.json >> vulnerability-summary.md - echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT + # Initialize markdown report + echo "# Security Scan Report - OpenFL" > report.md + echo "**Scan Date:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> report.md + echo "**Commit:** [${{ env.COMMIT_ID }}](https://github.com/rajithkrishnegowda/openfl/commit/${{ env.COMMIT_ID }})" >> report.md + echo -e "\n## Vulnerability Summary\n" >> report.md + + # Process filesystem results + if [ -f "trivy-fs-report.json" ]; then + FS_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] 
| length' trivy-fs-report.json || echo 0) + echo "### Filesystem Scans" >> report.md + echo "**Critical/High Vulnerabilities:** $FS_VULNS" >> report.md + + if [ "$FS_VULNS" -gt 0 ]; then + echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md + echo "|----------|----|---------|---------|-------------|" >> report.md + jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md + echo "::set-output name=has_vulnerabilities::true" fi fi - if [ -s trivy-docker-results.json ]; then - VULN_COUNT=$(jq '[.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[]] | length' trivy-docker-results.json) - if [ "$VULN_COUNT" -gt 0 ]; then - echo "### Docker Image Vulnerabilities Detected: $VULN_COUNT" >> vulnerability-summary.md - echo "" >> vulnerability-summary.md - echo "| Severity | Vulnerability ID | Package | Version | Description |" >> vulnerability-summary.md - echo "|----------|------------------|---------|---------|-------------|" >> vulnerability-summary.md - jq -r '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-docker-results.json >> vulnerability-summary.md - echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT + # Process image results + if [ -f "trivy-image-report.json" ]; then + IMG_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-image-report.json || echo 0) + echo -e "\n### Container Image Scans" >> report.md + echo "**Critical/High Vulnerabilities:** $IMG_VULNS" >> report.md + + if [ "$IMG_VULNS" -gt 0 ]; then + echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md + echo "|----------|----|---------|---------|-------------|" >> report.md + jq -r '.Results[]?.Vulnerabilities[]? 
| "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md + echo "::set-output name=has_vulnerabilities::true" fi fi - cat vulnerability-summary.md + # Add artifact download links + echo -e "\n## Next Steps\n" >> report.md + echo "1. Review the full reports in the workflow artifacts" >> report.md + echo "2. Address critical vulnerabilities immediately" >> report.md + echo "3. Create GitHub issues for tracking remediation" >> report.md - - name: Upload Vulnerability Summary - uses: actions/upload-artifact@v4 - if: always() - with: - name: vulnerability-summary - path: vulnerability-summary.md + cat report.md - - name: Extract Code Owners + # ============ NOTIFICATION PHASE ============ + - name: Extract CODEOWNERS emails id: codeowners - if: always() run: | - if [ -f "CODEOWNERS" ]; then - OWNERS=$(grep -v "^#" CODEOWNERS | grep -o '@[^ ]*' | sort -u | tr '\n' ',' | sed 's/,$//') - echo "owners=$OWNERS" >> $GITHUB_OUTPUT - echo "Found code owners: $OWNERS" - else - echo "No CODEOWNERS file found, using default recipients" - echo "owners=" >> $GITHUB_OUTPUT + # Install Python if not already present + if ! 
command -v python &> /dev/null; then + sudo apt-get update && sudo apt-get install -y python3 fi - - name: Read vulnerability summary into output - id: read_summary - if: always() - run: | - { - echo "summary<> $GITHUB_OUTPUT - - - name: Send Email Notification - if: always() + # Run the Python script + OUTPUT=$(python .github/scripts/extract_emails.py) + echo "Extracted emails: $OUTPUT" + + # Parse JSON output and set outputs + EMAILS=$(echo "$OUTPUT" | jq -r '.emails | join(",")') + echo "emails=${EMAILS:-${{ secrets.SECURITY_EMAIL_RECIPIENTS }}}" >> $GITHUB_OUTPUT + echo "codeowners_path=$(echo "$OUTPUT" | jq -r '.codeowners_path')" >> $GITHUB_OUTPUT + + env: + PYTHONIOENCODING: utf-8 + + - name: Send email notification + if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure()) uses: dawidd6/action-send-mail@v3 with: server_address: ${{ secrets.SMTP_SERVER }} server_port: ${{ secrets.SMTP_PORT }} username: ${{ secrets.SMTP_USERNAME }} password: ${{ secrets.SMTP_PASSWORD }} - subject: > - ${{ steps.summary.outputs.has_vulnerabilities == 'true' && '⚠️ Vulnerabilities Found:' || '✅ Scan Completed:' }} Security Scan for ${{ github.repository }} - html_body: | -

View workflow run | commit

- ${{ steps.read_summary.outputs.summary }} -

Download full reports from workflow artifacts.

- to: ${{ steps.codeowners.outputs.owners || secrets.SECURITY_EMAIL_RECIPIENTS }} - from: GitHub Actions - + subject: | + ${{ + failure() && '🚨 OpenFL Security Scan Failed' || + steps.report.outputs.has_vulnerabilities == 'true' && '⚠️ OpenFL Vulnerabilities Found' || + '✅ OpenFL Security Scan Passed' + }} + body: file://report.md + to: ${{ steps.codeowners.outputs.owners }} + from: "OpenFL Security Bot " + content_type: text/html + convert_markdown: true + + # ============ ARTIFACT UPLOADS ============ + - name: Upload scan artifacts + uses: actions/upload-artifact@v4 + with: + name: security-reports-${{ github.run_id }} + path: | + trivy-fs-report.json + trivy-image-report.json + trivy-fs-sbom.json + trivy-image-sbom.json + report.md + retention-days: 30 + + # ============ FAILURE HANDLING ============ + - name: Fail workflow if vulnerabilities found + if: steps.report.outputs.has_vulnerabilities == 'true' && github.event_name != 'schedule' + run: | + echo "::error::Critical/High vulnerabilities detected!" 
+ exit 1 From 2d99f1e71c985a51b2aaf7ac2286f2b823c5c2d1 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:26:10 +0530 Subject: [PATCH 06/34] Update CODEOWNERS --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index cb0b89fc6b..b36cb38d58 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -30,6 +30,6 @@ /scripts/ aayush.garg@intel.com giribabu.bikki@intel.com karan.shah@intel.com patrick.foley@intel.com srikanth.enugula@intel.com teodor.parvanov@intel.com # File level ownership -CODEOWNERS aayush.garg@intel.com giribabu.bikki@intel.com patrick.foley@intel.com preethi.asokan@intel.com rahul.garg@intel.com srikanth.enugula@intel.com teodor.parvanov@intel.com +CODEOWNERS akshay.pant@intel.com karan.shah@intel.com kevin.ta@intel.com noopur@intel.com patrick.foley@intel.com payal.chaurasiya@intel.com rahul.garg@intel.com rajith.krishnegowda@intel.com shailesh.pant@intel.com shailesh.tanwar@intel.com teodor.parvanov@intel.com test-requirements.txt akshay.pant@intel.com karan.shah@intel.com kevin.ta@intel.com noopur@intel.com patrick.foley@intel.com payal.chaurasiya@intel.com rahul.garg@intel.com rajith.krishnegowda@intel.com shailesh.pant@intel.com shailesh.tanwar@intel.com teodor.parvanov@intel.com From 522ad36cc150c709203b0fc0eb7aaeea880f434e Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:30:33 +0530 Subject: [PATCH 07/34] Update trivy.yml --- .github/workflows/trivy.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 8e7c6c60ac..357a0d292a 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -144,14 +144,13 @@ jobs: server_port: ${{ secrets.SMTP_PORT }} username: ${{ secrets.SMTP_USERNAME }} password: ${{ secrets.SMTP_PASSWORD }} - subject: 
| - ${{ - failure() && '🚨 OpenFL Security Scan Failed' || - steps.report.outputs.has_vulnerabilities == 'true' && '⚠️ OpenFL Vulnerabilities Found' || - '✅ OpenFL Security Scan Passed' - }} + subject: ${{ + (failure() && '🚨 OpenFL Security Scan Failed') || + (steps.report.outputs.has_vulnerabilities == 'true' && '⚠️ OpenFL Vulnerabilities Found') || + '✅ OpenFL Security Scan Passed' + }} body: file://report.md - to: ${{ steps.codeowners.outputs.owners }} + to: ${{ steps.codeowners.outputs.emails }} from: "OpenFL Security Bot " content_type: text/html convert_markdown: true From eaa2bcc1bea16c18b25366df9fd2435d581e7315 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:33:31 +0530 Subject: [PATCH 08/34] Update trivy.yml --- .github/workflows/trivy.yml | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 357a0d292a..124f32e725 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -17,7 +17,7 @@ jobs: security-events: write actions: read packages: read - issues: write # Needed for creating issues if vulnerabilities found + issues: write runs-on: ubuntu-22.04 timeout-minutes: 45 env: @@ -33,7 +33,7 @@ jobs: fetch-depth: 0 # ============ SCANNING PHASE ============ - - name: Run security scans + - name: Run filesystem scan uses: aquasecurity/trivy-action@0.30.0 with: scan-type: 'fs' @@ -89,7 +89,7 @@ jobs: echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md echo "|----------|----|---------|---------|-------------|" >> report.md jq -r '.Results[]?.Vulnerabilities[]? 
| "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md - echo "::set-output name=has_vulnerabilities::true" + echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT fi fi @@ -103,7 +103,7 @@ jobs: echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md echo "|----------|----|---------|---------|-------------|" >> report.md jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md - echo "::set-output name=has_vulnerabilities::true" + echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT fi fi @@ -116,19 +116,27 @@ jobs: cat report.md # ============ NOTIFICATION PHASE ============ + - name: Set notification subject + id: set-subject + run: | + if [[ "${{ job.status }}" == "failure" ]]; then + echo "subject=🚨 OpenFL Security Scan Failed" >> $GITHUB_OUTPUT + elif [[ "${{ steps.report.outputs.has_vulnerabilities }}" == "true" ]]; then + echo "subject=⚠️ OpenFL Vulnerabilities Found" >> $GITHUB_OUTPUT + else + echo "subject=✅ OpenFL Security Scan Passed" >> $GITHUB_OUTPUT + fi + - name: Extract CODEOWNERS emails id: codeowners run: | - # Install Python if not already present if ! 
command -v python &> /dev/null; then sudo apt-get update && sudo apt-get install -y python3 fi - # Run the Python script OUTPUT=$(python .github/scripts/extract_emails.py) echo "Extracted emails: $OUTPUT" - # Parse JSON output and set outputs EMAILS=$(echo "$OUTPUT" | jq -r '.emails | join(",")') echo "emails=${EMAILS:-${{ secrets.SECURITY_EMAIL_RECIPIENTS }}}" >> $GITHUB_OUTPUT echo "codeowners_path=$(echo "$OUTPUT" | jq -r '.codeowners_path')" >> $GITHUB_OUTPUT @@ -144,11 +152,7 @@ jobs: server_port: ${{ secrets.SMTP_PORT }} username: ${{ secrets.SMTP_USERNAME }} password: ${{ secrets.SMTP_PASSWORD }} - subject: ${{ - (failure() && '🚨 OpenFL Security Scan Failed') || - (steps.report.outputs.has_vulnerabilities == 'true' && '⚠️ OpenFL Vulnerabilities Found') || - '✅ OpenFL Security Scan Passed' - }} + subject: ${{ steps.set-subject.outputs.subject }} body: file://report.md to: ${{ steps.codeowners.outputs.emails }} from: "OpenFL Security Bot " From 9601e1d77ae51a0570e26ca69b4b0734c80619d3 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:43:55 +0530 Subject: [PATCH 09/34] Update trivy.yml --- .github/workflows/trivy.yml | 125 ++++-------------------------------- 1 file changed, 12 insertions(+), 113 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 124f32e725..4279b4beed 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -25,95 +25,7 @@ jobs: COMMIT_ID: ${{ inputs.commit_id || github.sha }} steps: - # ============ SETUP PHASE ============ - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ env.COMMIT_ID }} - fetch-depth: 0 - - # ============ SCANNING PHASE ============ - - name: Run filesystem scan - uses: aquasecurity/trivy-action@0.30.0 - with: - scan-type: 'fs' - format: 'json' - output: 'trivy-fs-report.json' - severity: 'CRITICAL,HIGH' - ignore-unfixed: true - vuln-type: 'os,library' - 
security-checks: 'vuln' - - - name: Build Docker image - run: | - docker buildx build \ - --pull \ - --tag local/scan-target:${{ github.run_id }} \ - --file openfl-docker/Dockerfile.base \ - --load \ - . - - - name: Scan Docker image - uses: aquasecurity/trivy-action@0.30.0 - with: - image-ref: 'local/scan-target:${{ github.run_id }}' - format: 'json' - output: 'trivy-image-report.json' - severity: 'CRITICAL,HIGH' - ignore-unfixed: true - vuln-type: 'os,library' - security-checks: 'vuln' - - # ============ REPORTING PHASE ============ - - name: Generate SBOM reports - run: | - trivy fs --format spdx-json --output trivy-fs-sbom.json . - trivy image --format spdx-json --output trivy-image-sbom.json local/scan-target:${{ github.run_id }} - - - name: Create consolidated report - id: report - run: | - # Initialize markdown report - echo "# Security Scan Report - OpenFL" > report.md - echo "**Scan Date:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> report.md - echo "**Commit:** [${{ env.COMMIT_ID }}](https://github.com/rajithkrishnegowda/openfl/commit/${{ env.COMMIT_ID }})" >> report.md - echo -e "\n## Vulnerability Summary\n" >> report.md - - # Process filesystem results - if [ -f "trivy-fs-report.json" ]; then - FS_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-fs-report.json || echo 0) - echo "### Filesystem Scans" >> report.md - echo "**Critical/High Vulnerabilities:** $FS_VULNS" >> report.md - - if [ "$FS_VULNS" -gt 0 ]; then - echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md - echo "|----------|----|---------|---------|-------------|" >> report.md - jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md - echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT - fi - fi - - # Process image results - if [ -f "trivy-image-report.json" ]; then - IMG_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] 
| length' trivy-image-report.json || echo 0) - echo -e "\n### Container Image Scans" >> report.md - echo "**Critical/High Vulnerabilities:** $IMG_VULNS" >> report.md - - if [ "$IMG_VULNS" -gt 0 ]; then - echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md - echo "|----------|----|---------|---------|-------------|" >> report.md - jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md - echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT - fi - fi - - # Add artifact download links - echo -e "\n## Next Steps\n" >> report.md - echo "1. Review the full reports in the workflow artifacts" >> report.md - echo "2. Address critical vulnerabilities immediately" >> report.md - echo "3. Create GitHub issues for tracking remediation" >> report.md - - cat report.md + # [Previous steps remain unchanged...] # ============ NOTIFICATION PHASE ============ - name: Set notification subject @@ -139,11 +51,17 @@ jobs: EMAILS=$(echo "$OUTPUT" | jq -r '.emails | join(",")') echo "emails=${EMAILS:-${{ secrets.SECURITY_EMAIL_RECIPIENTS }}}" >> $GITHUB_OUTPUT - echo "codeowners_path=$(echo "$OUTPUT" | jq -r '.codeowners_path')" >> $GITHUB_OUTPUT env: PYTHONIOENCODING: utf-8 + - name: Convert report to HTML + id: convert-report + run: | + pip install markdown + python -c "import markdown; print(markdown.markdown(open('report.md').read()))" > report.html + echo "html_body=$(cat report.html)" >> $GITHUB_OUTPUT + - name: Send email notification if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure()) uses: dawidd6/action-send-mail@v3 @@ -153,28 +71,9 @@ jobs: username: ${{ secrets.SMTP_USERNAME }} password: ${{ secrets.SMTP_PASSWORD }} subject: ${{ steps.set-subject.outputs.subject }} - body: file://report.md + html_body: ${{ steps.convert-report.outputs.html_body }} to: ${{ steps.codeowners.outputs.emails }} - from: "OpenFL Security Bot 
" - content_type: text/html - convert_markdown: true + from: OpenFL Security Bot + convert_markdown: false - # ============ ARTIFACT UPLOADS ============ - - name: Upload scan artifacts - uses: actions/upload-artifact@v4 - with: - name: security-reports-${{ github.run_id }} - path: | - trivy-fs-report.json - trivy-image-report.json - trivy-fs-sbom.json - trivy-image-sbom.json - report.md - retention-days: 30 - - # ============ FAILURE HANDLING ============ - - name: Fail workflow if vulnerabilities found - if: steps.report.outputs.has_vulnerabilities == 'true' && github.event_name != 'schedule' - run: | - echo "::error::Critical/High vulnerabilities detected!" - exit 1 + # [Remaining steps remain unchanged...] From 7ec5b90c616349c8951774a768f742c916358396 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:47:20 +0530 Subject: [PATCH 10/34] Update trivy.yml --- .github/workflows/trivy.yml | 110 +++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 4279b4beed..7b550fb18a 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -25,7 +25,95 @@ jobs: COMMIT_ID: ${{ inputs.commit_id || github.sha }} steps: - # [Previous steps remain unchanged...] 
+ # ============ SETUP PHASE ============ + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ env.COMMIT_ID }} + fetch-depth: 0 + + # ============ SCANNING PHASE ============ + - name: Run filesystem scan + uses: aquasecurity/trivy-action@0.30.0 + with: + scan-type: 'fs' + format: 'json' + output: 'trivy-fs-report.json' + severity: 'CRITICAL,HIGH' + ignore-unfixed: true + vuln-type: 'os,library' + security-checks: 'vuln' + + - name: Build Docker image + run: | + docker buildx build \ + --pull \ + --tag local/scan-target:${{ github.run_id }} \ + --file openfl-docker/Dockerfile.base \ + --load \ + . + + - name: Scan Docker image + uses: aquasecurity/trivy-action@0.30.0 + with: + image-ref: 'local/scan-target:${{ github.run_id }}' + format: 'json' + output: 'trivy-image-report.json' + severity: 'CRITICAL,HIGH' + ignore-unfixed: true + vuln-type: 'os,library' + security-checks: 'vuln' + + # ============ REPORTING PHASE ============ + - name: Generate SBOM reports + run: | + trivy fs --format spdx-json --output trivy-fs-sbom.json . + trivy image --format spdx-json --output trivy-image-sbom.json local/scan-target:${{ github.run_id }} + + - name: Create consolidated report + id: report + run: | + # Initialize markdown report + echo "# Security Scan Report - OpenFL" > report.md + echo "**Scan Date:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> report.md + echo "**Commit:** [${{ env.COMMIT_ID }}](https://github.com/rajithkrishnegowda/openfl/commit/${{ env.COMMIT_ID }})" >> report.md + echo -e "\n## Vulnerability Summary\n" >> report.md + + # Process filesystem results + if [ -f "trivy-fs-report.json" ]; then + FS_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] 
| length' trivy-fs-report.json || echo 0) + echo "### Filesystem Scans" >> report.md + echo "**Critical/High Vulnerabilities:** $FS_VULNS" >> report.md + + if [ "$FS_VULNS" -gt 0 ]; then + echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md + echo "|----------|----|---------|---------|-------------|" >> report.md + jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md + echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT + fi + fi + + # Process image results + if [ -f "trivy-image-report.json" ]; then + IMG_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-image-report.json || echo 0) + echo -e "\n### Container Image Scans" >> report.md + echo "**Critical/High Vulnerabilities:** $IMG_VULNS" >> report.md + + if [ "$IMG_VULNS" -gt 0 ]; then + echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md + echo "|----------|----|---------|---------|-------------|" >> report.md + jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md + echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT + fi + fi + + # Add artifact download links + echo -e "\n## Next Steps\n" >> report.md + echo "1. Review the full reports in the workflow artifacts" >> report.md + echo "2. Address critical vulnerabilities immediately" >> report.md + echo "3. Create GitHub issues for tracking remediation" >> report.md + + cat report.md # ============ NOTIFICATION PHASE ============ - name: Set notification subject @@ -76,4 +164,22 @@ jobs: from: OpenFL Security Bot convert_markdown: false - # [Remaining steps remain unchanged...] 
+ # ============ ARTIFACT UPLOADS ============ + - name: Upload scan artifacts + uses: actions/upload-artifact@v4 + with: + name: security-reports-${{ github.run_id }} + path: | + trivy-fs-report.json + trivy-image-report.json + trivy-fs-sbom.json + trivy-image-sbom.json + report.md + retention-days: 30 + + # ============ FAILURE HANDLING ============ + - name: Fail workflow if vulnerabilities found + if: steps.report.outputs.has_vulnerabilities == 'true' && github.event_name != 'schedule' + run: | + echo "::error::Critical/High vulnerabilities detected!" + exit 1 From 447c9423a23e687491ae85af46445c4c86147570 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:55:51 +0530 Subject: [PATCH 11/34] Update trivy.yml --- .github/workflows/trivy.yml | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 7b550fb18a..0380bdf26b 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -143,23 +143,29 @@ jobs: env: PYTHONIOENCODING: utf-8 - - name: Convert report to HTML - id: convert-report + - name: Prepare HTML email content + id: prepare-email run: | - pip install markdown - python -c "import markdown; print(markdown.markdown(open('report.md').read()))" > report.html - echo "html_body=$(cat report.html)" >> $GITHUB_OUTPUT + # Install markdown processor + python -m pip install markdown + + # Convert markdown to HTML and properly escape for GITHUB_OUTPUT + HTML_CONTENT=$(python -c "import markdown, json; print(json.dumps(markdown.markdown(open('report.md').read())))") + echo "html_body=${HTML_CONTENT}" >> $GITHUB_OUTPUT - name: Send email notification if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure()) uses: dawidd6/action-send-mail@v3 with: - server_address: ${{ secrets.SMTP_SERVER }} - server_port: ${{ secrets.SMTP_PORT }} - username: ${{ 
secrets.SMTP_USERNAME }} - password: ${{ secrets.SMTP_PASSWORD }} + # Try connection_url format if server_address fails + connection_url: ${{ secrets.SMTP_CONNECTION_URL }} # Format: "smtps://user:pass@host:port" + # Alternative if connection_url doesn't work: + # server_address: ${{ secrets.SMTP_SERVER }} + # server_port: ${{ secrets.SMTP_PORT }} + # username: ${{ secrets.SMTP_USERNAME }} + # password: ${{ secrets.SMTP_PASSWORD }} subject: ${{ steps.set-subject.outputs.subject }} - html_body: ${{ steps.convert-report.outputs.html_body }} + html_body: ${{ fromJSON(steps.prepare-email.outputs.html_body) }} to: ${{ steps.codeowners.outputs.emails }} from: OpenFL Security Bot convert_markdown: false From 68f986d127eb8d26133ee925f58dcb0c82fa7f40 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 12:26:28 +0530 Subject: [PATCH 12/34] Update trivy.yml --- .github/workflows/trivy.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 0380bdf26b..1d69915fb8 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -158,12 +158,12 @@ jobs: uses: dawidd6/action-send-mail@v3 with: # Try connection_url format if server_address fails - connection_url: ${{ secrets.SMTP_CONNECTION_URL }} # Format: "smtps://user:pass@host:port" + #connection_url: ${{ secrets.SMTP_CONNECTION_URL }} # Format: "smtps://user:pass@host:port" # Alternative if connection_url doesn't work: - # server_address: ${{ secrets.SMTP_SERVER }} - # server_port: ${{ secrets.SMTP_PORT }} - # username: ${{ secrets.SMTP_USERNAME }} - # password: ${{ secrets.SMTP_PASSWORD }} + server_address: ${{ secrets.SMTP_SERVER }} + server_port: ${{ secrets.SMTP_PORT }} + username: ${{ secrets.SMTP_USERNAME }} + password: ${{ secrets.SMTP_PASSWORD }} subject: ${{ steps.set-subject.outputs.subject }} html_body: ${{ 
fromJSON(steps.prepare-email.outputs.html_body) }} to: ${{ steps.codeowners.outputs.emails }} From 88e1769276831c129645540bed29477a52120241 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 12:42:33 +0530 Subject: [PATCH 13/34] Update trivy.yml --- .github/workflows/trivy.yml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 1d69915fb8..5fe05b50db 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -155,20 +155,17 @@ jobs: - name: Send email notification if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure()) - uses: dawidd6/action-send-mail@v3 + uses: actions-hub/smtp@master with: - # Try connection_url format if server_address fails - #connection_url: ${{ secrets.SMTP_CONNECTION_URL }} # Format: "smtps://user:pass@host:port" - # Alternative if connection_url doesn't work: - server_address: ${{ secrets.SMTP_SERVER }} - server_port: ${{ secrets.SMTP_PORT }} + server: ${{ secrets.SMTP_SERVER }} + port: ${{ secrets.SMTP_PORT }} username: ${{ secrets.SMTP_USERNAME }} password: ${{ secrets.SMTP_PASSWORD }} subject: ${{ steps.set-subject.outputs.subject }} html_body: ${{ fromJSON(steps.prepare-email.outputs.html_body) }} to: ${{ steps.codeowners.outputs.emails }} from: OpenFL Security Bot - convert_markdown: false + secure: true # ============ ARTIFACT UPLOADS ============ - name: Upload scan artifacts From 10e0bf4292ed522dd5453608d48cc64e79512e22 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 13:04:37 +0530 Subject: [PATCH 14/34] Create send_email.py --- .github/scripts/send_email.py | 97 +++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 .github/scripts/send_email.py diff --git a/.github/scripts/send_email.py 
b/.github/scripts/send_email.py new file mode 100644 index 0000000000..4c151e30fc --- /dev/null +++ b/.github/scripts/send_email.py @@ -0,0 +1,97 @@ +import os +import smtplib +import logging +import argparse +from email.message import EmailMessage +from email.mime.base import MIMEBase +from email import encoders + +logger = logging.getLogger(__name__) + +def send_email(sender_email: str, to_email: str, subject: str, email_body: str, smtp_user: str, smtp_pwd: str, + smtp_email_server: str, cc_email: str = '', bcc_email: str = '', reply_email: str = '', is_html_body: bool = False, + attachments: str = '') -> None: + + message = EmailMessage() + message["Subject"] = subject + message["From"] = sender_email + if to_email: + to_list = to_email.split(",") + message["To"] = ", ".join(to_list) + if cc_email: + cc_list = cc_email.split(",") + message["Cc"] = ", ".join(cc_list) + if reply_email: + message["Reply-To"] = reply_email + sub_type = 'plain' + if is_html_body: + sub_type = 'html' + message.set_content(email_body, subtype=sub_type) + # Set up attachment if any + if attachments: + for attachment in attachments.split(','): + with open(attachment, 'rb') as attachment_file: + attachment_data = attachment_file.read() + message.add_attachment( + attachment_data, + maintype='application', + subtype='octet-stream', + filename=os.path.basename(attachment) + ) + logger.info(f'Setting smtp server {smtp_email_server}...') + smtp_server = smtplib.SMTP(smtp_email_server) + smtp_server.starttls() + smtp_server.login(smtp_user, smtp_pwd) + logger.info(f'smtp server authentication successful') + try: + logger.info(f'Sending email...') + if bcc_email: + # Send bcc list as an argument instead of adding it to the header to keep it hidden + bcc_list = bcc_email.split(",") + smtp_server.send_message(message, bcc=bcc_list) + else: + smtp_server.send_message(message) + logger.info(f'email sent.') + except Exception as ex: + raise ex + finally: + try: + smtp_server.quit() + except 
smtplib.SMTPServerDisconnected: + pass + finally: + logger.info("smtp connection is closed") + +def main(): + parser = argparse.ArgumentParser(description="Send an email with optional attachments") + parser.add_argument('--sender', required=True, help='Sender email address') + parser.add_argument('--to', required=True, help='Recipient email address(es) (comma-separated)') + parser.add_argument('--subject', required=True, help='Email subject') + parser.add_argument('--body', required=True, help='Email body') + parser.add_argument('--smtp-user', required=True, help='SMTP server username') + parser.add_argument('--smtp-pwd', required=True, help='SMTP server password') + parser.add_argument('--smtp-server', required=True, help='SMTP server address and port') + parser.add_argument('--cc', default='', help='CC email address(es) (comma-separated)') + parser.add_argument('--bcc', default='', help='BCC email address(es) (comma-separated)') + parser.add_argument('--reply-to', default='', help='Reply-To email address') + parser.add_argument('--html-body', action='store_true', help='Flag to indicate if email body is HTML') + parser.add_argument('--attachments', default='', help='Attachment file path(s) (space-separated)') + args = parser.parse_args() + + send_email( + sender_email=args.sender, + to_email=args.to, + subject=args.subject, + email_body=args.body, + smtp_user=args.smtp_user, + smtp_pwd=args.smtp_pwd, + smtp_email_server=args.smtp_server, + cc_email=args.cc, + bcc_email=args.bcc, + reply_email=args.reply_to, + is_html_body=args.html_body, + attachments=args.attachments + ) + +if __name__ == '__main__': + main() From 0796a834dd7e926774f00e3f13fc1fd001777b7d Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Fri, 25 Apr 2025 13:16:06 +0530 Subject: [PATCH 15/34] Update trivy.yml --- .github/workflows/trivy.yml | 46 ++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff 
--git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 5fe05b50db..7c9b79761a 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -32,6 +32,11 @@ jobs: ref: ${{ env.COMMIT_ID }} fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + # ============ SCANNING PHASE ============ - name: Run filesystem scan uses: aquasecurity/trivy-action@0.30.0 @@ -143,29 +148,34 @@ jobs: env: PYTHONIOENCODING: utf-8 - - name: Prepare HTML email content + - name: Prepare email content id: prepare-email run: | - # Install markdown processor + # Convert markdown to HTML python -m pip install markdown + HTML_CONTENT=$(python -c "import markdown; print(markdown.markdown(open('report.md').read()))") + echo "html_body<> $GITHUB_OUTPUT + echo "$HTML_CONTENT" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT - # Convert markdown to HTML and properly escape for GITHUB_OUTPUT - HTML_CONTENT=$(python -c "import markdown, json; print(json.dumps(markdown.markdown(open('report.md').read())))") - echo "html_body=${HTML_CONTENT}" >> $GITHUB_OUTPUT - - - name: Send email notification + - name: Send email via Python script if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure()) - uses: actions-hub/smtp@master - with: - server: ${{ secrets.SMTP_SERVER }} - port: ${{ secrets.SMTP_PORT }} - username: ${{ secrets.SMTP_USERNAME }} - password: ${{ secrets.SMTP_PASSWORD }} - subject: ${{ steps.set-subject.outputs.subject }} - html_body: ${{ fromJSON(steps.prepare-email.outputs.html_body) }} - to: ${{ steps.codeowners.outputs.emails }} - from: OpenFL Security Bot - secure: true + env: + SMTP_SERVER: ${{ secrets.SMTP_SERVER }} + SMTP_PORT: ${{ secrets.SMTP_PORT }} + SMTP_USER: ${{ secrets.SMTP_USER }} + SMTP_PASSWORD: ${{ secrets.SMTP_PASSWORD }} + RECIPIENTS: ${{ steps.codeowners.outputs.emails }} + run: | + python .github/scripts/send_email.py \ + --sender "security@openfl.github" \ + --to 
"$RECIPIENTS" \ + --subject "${{ steps.set-subject.outputs.subject }}" \ + --body "${{ steps.prepare-email.outputs.html_body }}" \ + --smtp-user "$SMTP_USER" \ + --smtp-pwd "$SMTP_PASSWORD" \ + --smtp-server "$SMTP_SERVER:$SMTP_PORT" \ + --html-body # ============ ARTIFACT UPLOADS ============ - name: Upload scan artifacts From 7c3f5eb4df42a1ae16e9bd727dc0ee4c1d16df5b Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Thu, 8 May 2025 21:05:03 +0530 Subject: [PATCH 16/34] Create phi-4-quanti.ipynb --- .../workflow/LLM/phi-4-quanti.ipynb | 705 ++++++++++++++++++ 1 file changed, 705 insertions(+) create mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb new file mode 100644 index 0000000000..0c6884f384 --- /dev/null +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -0,0 +1,705 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a59f475d-d843-46bc-b75e-10984b687ed3", + "metadata": {}, + "source": [ + "# Federated Fine-Tuning of Phi-4 Using OpenFL" + ] + }, + { + "cell_type": "markdown", + "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", + "metadata": {}, + "source": [ + "\n", + "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow.\n", + "\n", + "We will fine-tune **Microsoft's [Phi4](https://huggingface.co/microsoft/phi-4)** model using a diverse dataset such as [Math_10k](https://github.com/AGI-Edgerunners/LLM-Adapters/tree/main), an open-source dataset containing mathematical question-answer pairs collected from various smaller math datasets." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", + "metadata": {}, + "source": [ + "## The Workflow Interface" + ] + }, + { + "cell_type": "markdown", + "id": "e3d74610-e48d-4dd4-b622-eb910fbe91aa", + "metadata": {}, + "source": [ + "The workflow interface is an innovative approach to designing federated learning experiments with OpenFL. It was developed in response to discussions with researchers and users who had unique use cases that didn’t perfectly align with the traditional horizontal federated learning model. This interface enables more flexible compositions of experiments, allowing for greater customization and adaptability in complex, real-world scenarios" + ] + }, + { + "cell_type": "markdown", + "id": "413e1d95-fd76-4fe0-b8d0-4c625c2a8fd3", + "metadata": {}, + "source": [ + "## Installing OpenFL\n", + "To install OpenFL, follow the official documentation: \n", + "[OpenFL Installation Guide](https://openfl.readthedocs.io/en/latest/installation.html)" + ] + }, + { + "cell_type": "markdown", + "id": "53654c70", + "metadata": {}, + "source": [ + "After installation, activate experimental APIs using: \n", + "`fx experimental activate`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc", + "metadata": {}, + "outputs": [], + "source": [ + "# Install dependencies \n", + "!pip install torch transformers peft datasets trl==0.12.2 -q" + ] + }, + { + "cell_type": "markdown", + "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", + "metadata": {}, + "source": [ + "## Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", + "metadata": {}, + "outputs": [], + "source": [ + "import hashlib\n", + "import os\n", + "\n", + "import numpy as np\n", + "import requests\n", + "import torch\n", + "import transformers\n", + "from datasets import load_dataset\n", + "from peft import LoraConfig, get_peft_model\n", + 
"from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments\n", + "from transformers.trainer_callback import PrinterCallback\n", + "from trl import SFTTrainer\n", + "\n", + "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n", + "from openfl.experimental.workflow.placement import aggregator, collaborator\n", + "from openfl.experimental.workflow.runtime import LocalRuntime" + ] + }, + { + "cell_type": "markdown", + "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", + "metadata": {}, + "source": [ + "## Acquiring and preprocessing dataset" + ] + }, + { + "cell_type": "markdown", + "id": "7ba1d8b6-8a5b-41a2-8c77-c9a85e869cda", + "metadata": {}, + "source": [ + "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d615d626-8727-4169-b2a6-3ba15c3cdb95", + "metadata": {}, + "outputs": [], + "source": [ + "def file_checksum(file_path, algorithm=\"sha256\"):\n", + " \"\"\"\n", + " Calculate the checksum of a file using the specified hashing algorithm.\n", + "\n", + " Parameters:\n", + " file_path (str): The path to the file for which the checksum is to be calculated.\n", + " algorithm (str): The hashing algorithm to use (default is 'sha256').\n", + "\n", + " Returns:\n", + " str: The calculated checksum of the file.\n", + " \"\"\"\n", + " hash_func = hashlib.new(algorithm)\n", + " with open(file_path, \"rb\") as f:\n", + " for chunk in iter(lambda: f.read(4096), b\"\"):\n", + " hash_func.update(chunk)\n", + " return hash_func.hexdigest()\n", + "\n", + "\n", + "if not os.path.exists(\"math_10k.json\"):\n", + " r = requests.get(\n", + " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n", + " )\n", + " with open(\n", + " \"math_10k.json\",\n", + " 
\"wb\",\n", + " ) as f:\n", + " f.write(r.content)\n", + "\n", + " actual_checksum = file_checksum(\"math_10k.json\")\n", + " if (\n", + " actual_checksum\n", + " != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n", + " ):\n", + " raise ValueError(\n", + " \"Checksum verification failed. The file may have been altered.\"\n", + " )\n", + "\n", + "raw_dataset = load_dataset(\"json\", data_files=\"math_10k.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "3ab15ad6-db35-4a58-a2d5-54a6d3ccdc78", + "metadata": {}, + "source": [ + "## Initialize arguments and configurations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eada9809-468a-47c6-9b03-55aa887c9487", + "metadata": {}, + "outputs": [], + "source": [ + "training_config = {\n", + " \"bf16\": True,\n", + " \"use_cpu\": True,\n", + " \"do_eval\": False,\n", + " \"learning_rate\": 5.0e-06,\n", + " \"log_level\": \"info\",\n", + " \"logging_steps\": 20,\n", + " \"lr_scheduler_type\": \"cosine\",\n", + " \"num_train_epochs\": 1,\n", + " \"output_dir\": \"./checkpoint_dir\",\n", + " \"overwrite_output_dir\": True,\n", + " \"per_device_eval_batch_size\": 1,\n", + " \"per_device_train_batch_size\": 1,\n", + " \"save_steps\": 100,\n", + " \"save_total_limit\": 1,\n", + " \"seed\": 0,\n", + " \"gradient_checkpointing\": True,\n", + " \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n", + " \"warmup_ratio\": 0.2,\n", + "}\n", + "\n", + "peft_config = {\n", + " \"r\": 1,\n", + " \"lora_alpha\": 2,\n", + " \"lora_dropout\": 0.05,\n", + " \"bias\": \"none\",\n", + " \"task_type\": \"CAUSAL_LM\",\n", + " \"target_modules\": \"all-linear\",\n", + " \"modules_to_save\": None,\n", + "}\n", + "model_kwargs = dict(\n", + " use_cache=False,\n", + " trust_remote_code=True,\n", + " torch_dtype=torch.bfloat16,\n", + " device_map=None,\n", + ")\n", + "train_conf = TrainingArguments(**training_config)\n", + "peft_conf = LoraConfig(**peft_config)" + ] + }, + { + 
"cell_type": "markdown", + "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", + "metadata": {}, + "source": [ + "## Load and initialize model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint_path = \"NyxKrage/Microsoft_Phi-4\"\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " checkpoint_path, return_dict=True, **model_kwargs\n", + ")\n", + "model = get_peft_model(model, peft_conf)\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)\n", + "sequence_max_length = 512\n", + "val_set_size = 2000\n", + "tokenizer.pad_token_id = 0 # we want this to be different from the eos token\n", + "tokenizer.padding_side = \"left\" # Allow batched inference" + ] + }, + { + "cell_type": "markdown", + "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", + "metadata": {}, + "source": [ + "## Preprocess dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_prompt(data_point):\n", + " \"\"\"\n", + " Generate a prompt based on the given data point.\n", + "\n", + " Parameters:\n", + " data_point (dict): A dictionary containing the instruction, input, and output.\n", + "\n", + " Returns:\n", + " str: The generated prompt as a string.\n", + " \"\"\"\n", + " if data_point[\"input\"]:\n", + " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. \n", + "\n", + " ### Instruction:\n", + " {data_point[\"instruction\"]}\n", + " \n", + " ### Input:\n", + " {data_point[\"input\"]}\n", + " \n", + " ### Response:\n", + " {data_point[\"output\"]}\"\"\"\n", + " else:\n", + " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request. 
\n", + "\n", + " ### Instruction:\n", + " {data_point[\"instruction\"]}\n", + " \n", + " ### Response:\n", + " {data_point[\"output\"]}\"\"\"\n", + "\n", + "\n", + "def tokenize(prompt, add_eos_token=True):\n", + " \"\"\"\n", + " Tokenize the given prompt.\n", + "\n", + " Parameters:\n", + " prompt (str): The prompt to be tokenized.\n", + " add_eos_token (bool): Whether to add an end-of-sequence token (default is True).\n", + "\n", + " Returns:\n", + " dict: A dictionary containing the tokenized input IDs and attention mask.\n", + " \"\"\"\n", + " result = tokenizer(\n", + " prompt,\n", + " truncation=True,\n", + " max_length=sequence_max_length,\n", + " padding=False,\n", + " return_tensors=None,\n", + " )\n", + " if (\n", + " result[\"input_ids\"][-1] != tokenizer.eos_token_id\n", + " and len(result[\"input_ids\"]) < sequence_max_length\n", + " and add_eos_token\n", + " ):\n", + " result[\"input_ids\"].append(tokenizer.eos_token_id)\n", + " result[\"attention_mask\"].append(1)\n", + "\n", + " result[\"labels\"] = result[\"input_ids\"].copy()\n", + "\n", + " return result\n", + "\n", + "\n", + "def generate_and_tokenize_prompt(data_point):\n", + " \"\"\"\n", + " Generate and tokenize a prompt based on the given data point.\n", + "\n", + " Parameters:\n", + " data_point (dict): A dictionary containing the instruction, input, and output.\n", + "\n", + " Returns:\n", + " dict: A dictionary containing the tokenized input IDs, attention mask, and labels.\n", + " \"\"\"\n", + " full_prompt = generate_prompt(data_point)\n", + " tokenized_full_prompt = tokenize(full_prompt)\n", + " user_prompt = generate_prompt({**data_point, \"output\": \"\"})\n", + " tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)\n", + " user_prompt_len = len(tokenized_user_prompt[\"input_ids\"])\n", + "\n", + " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\n", + " \"labels\"\n", + " ][user_prompt_len:]\n", + " return tokenized_full_prompt\n", 
+ "\n", + "\n", + "train_val = raw_dataset[\"train\"].train_test_split(\n", + " test_size=val_set_size, shuffle=True, seed=42\n", + ")\n", + "\n", + "processed_train_dataset = train_val[\"train\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))\n", + "processed_test_dataset = train_val[\"test\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))" + ] + }, + { + "cell_type": "markdown", + "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", + "metadata": {}, + "source": [ + "## Define Federated Averaging Method\n", + "The FedAvg method is used to average the models from all the collaborators after training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", + "metadata": {}, + "outputs": [], + "source": [ + "def FedAvg(peft_params, model, weights=None):\n", + " \"\"\"\n", + " Perform Federated Averaging (FedAvg) on the model parameters.\n", + "\n", + " Parameters:\n", + " peft_params (list): A list of state dictionaries containing the model parameters from different clients.\n", + " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n", + " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n", + "\n", + " Returns:\n", + " torch.nn.Module: The model with the averaged parameters applied.\n", + " \"\"\"\n", + " state_dicts = peft_params\n", + " state_dict = get_peft_model_state_dict(model)\n", + " for key in peft_params[0]:\n", + " dtype = state_dicts[0][key].dtype\n", + " state_dict[key] = torch.from_numpy(\n", + " np.average(\n", + " [state[key].to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n", + " )\n", + " ).to(dtype)\n", + " set_peft_model_state_dict(model, state_dict)\n", + " return model" + ] + }, + { + "cell_type": "markdown", + "id": "810eb75e", + "metadata": {}, + "source": [ + "Now we come to the flow definition. 
The OpenFL Workflow Interface adopts the conventions set by Metaflow, that every workflow begins with `start` and concludes with the `end` task. The aggregator begins with an optionally passed in model and optimizer. The aggregator begins the flow with the `start` task, where the list of collaborators is extracted from the runtime (`self.collaborators = self.runtime.collaborators`) and is then used as the list of participants to run the task listed in `self.next`, `aggregated_model_validation`. The model, optimizer, and anything that is not explicitly excluded from the next function will be passed from the `start` function on the aggregator to the `aggregated_model_validation` task on the collaborator. Where the tasks run is determined by the placement decorator that precedes each task definition (`@aggregator` or `@collaborator`). Once each of the collaborators (defined in the runtime) complete the `aggregated_model_validation` task, they pass their current state onto the `train` task, from `train` to `local_model_validation`, and then finally to `join` at the aggregator. It is in `join` that an average is taken of the model weights, and the next round can begin.\n", + "\n", + "![Workflow Interface](../../../../docs/images/workflow_interface.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58298e8e-ab9e-4377-966e-143823441697", + "metadata": {}, + "outputs": [], + "source": [ + "class FederatedFlow(FLSpec):\n", + " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n", + " \"\"\"\n", + " Initialize the class with the given model, optimizer, and number of rounds.\n", + "\n", + " Parameters:\n", + " model (torch.nn.Module, optional): The model to be used. 
If None, a ValueError is raised.\n", + " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n", + " rounds (int, optional): The number of rounds for training or processing (default is 3).\n", + " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n", + "\n", + " Raises:\n", + " ValueError: If no model is provided.\n", + " \"\"\"\n", + " super().__init__(**kwargs)\n", + " if model is not None:\n", + " self.model = model\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " self.optimizer = optimizer\n", + " else:\n", + " raise ValueError(\"No model inputted\")\n", + "\n", + " self.rounds = rounds\n", + " \n", + "\n", + " @aggregator\n", + " def start(self):\n", + " \"\"\"\n", + " Initialize the model and set up the collaborators for federated learning.\n", + "\n", + " This method performs the initial setup for the model, including setting the\n", + " collaborators, initializing private variables, and starting the first round\n", + " of the federated learning process.\n", + " \"\"\"\n", + " print(f\"Performing initialization for model\")\n", + " self.collaborators = self.runtime.collaborators\n", + " self.current_round = 0\n", + " self.next(\n", + " self.aggregated_model_validation,\n", + " foreach=\"collaborators\",\n", + " )\n", + "\n", + " \n", + " @collaborator\n", + " def aggregated_model_validation(self):\n", + " \"\"\"\n", + " Perform aggregated model validation for a collaborator.\n", + "\n", + " This method loads the model, applies the PEFT configuration, and evaluates\n", + " the model using the provided training and evaluation datasets. 
The validation\n", + " score is then stored and the next step in the process is triggered.\n", + " \"\"\"\n", + " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " checkpoint_path, return_dict=True, **model_kwargs\n", + " )\n", + " self.model = get_peft_model(self.model, peft_conf)\n", + " set_peft_model_state_dict(self.model, self.peft_params)\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=train_conf,\n", + " peft_config=peft_conf,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=sequence_max_length,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + "\n", + " trainer.remove_callback(PrinterCallback)\n", + " out = trainer.evaluate()\n", + " self.agg_validation_score = out[\"eval_loss\"]\n", + " print(f\"{self.input} value of {self.agg_validation_score}\")\n", + " self.next(self.train)\n", + "\n", + " @collaborator\n", + " def train(self):\n", + " \"\"\"\n", + " Train the model for a collaborator.\n", + "\n", + " This method trains the model using the provided training and evaluation datasets.\n", + " The training loss is stored, the model is saved, and the next step in the process\n", + " is triggered.\n", + " \"\"\"\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=train_conf,\n", + " peft_config=peft_conf,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=sequence_max_length,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", 
+ "\n", + " out = trainer.train()\n", + " self.loss = out.training_loss\n", + " trainer.save_model()\n", + " self.training_completed = True\n", + " self.next(self.local_model_validation)\n", + "\n", + " @collaborator\n", + " def local_model_validation(self):\n", + " \"\"\"\n", + " Perform local model validation for a collaborator.\n", + "\n", + " This method evaluates the model using the provided training and evaluation datasets.\n", + " The validation score is stored, the PEFT parameters are updated, and the next step\n", + " in the process is triggered.\n", + " \"\"\"\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=train_conf,\n", + " peft_config=peft_conf,\n", + " train_dataset=processed_train_dataset,\n", + " eval_dataset=processed_test_dataset,\n", + " max_seq_length=sequence_max_length,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + " out = trainer.evaluate()\n", + " self.local_validation_score = out[\"eval_loss\"]\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " print(f\"Doing local model validation for collaborator {self.input}\")\n", + " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n", + "\n", + " @aggregator\n", + " def join(self, inputs):\n", + " \"\"\"\n", + " Aggregate the results from all collaborators and update the model.\n", + "\n", + " This method calculates the average loss, aggregated model accuracy, and local model\n", + " accuracy from all collaborators. 
The model parameters are updated using Federated\n", + " Averaging (FedAvg), and the next round of the process is triggered if applicable.\n", + " \"\"\"\n", + " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", + " self.aggregated_model_accuracy = sum(\n", + " input.agg_validation_score for input in inputs\n", + " ) / len(inputs)\n", + " self.local_model_accuracy = sum(\n", + " input.local_validation_score for input in inputs\n", + " ) / len(inputs)\n", + " print(\n", + " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n", + " )\n", + " print(f\"Average training loss = {self.average_loss}\")\n", + " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n", + "\n", + " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + "\n", + " self.model.save_pretrained(\"./aggregated/model\")\n", + " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n", + " self.current_round += 1\n", + " if self.current_round < self.rounds:\n", + " self.next(\n", + " self.aggregated_model_validation,\n", + " foreach=\"collaborators\",\n", + " exclude=[\"model\"],\n", + " )\n", + " else:\n", + " self.next(self.end)\n", + "\n", + " @aggregator\n", + " def end(self):\n", + " \"\"\"\n", + " End the federated learning process.\n", + "\n", + " This method marks the end of the federated learning process and performs any\n", + " necessary cleanup or finalization steps.\n", + " \"\"\"\n", + " print(f\"This is the end of the flow\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", + "metadata": {}, + "source": [ + "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_dataset` and `eval_dataset` for each of the collaborators. 
These are **private_attributes** that are exposed only through the runtime. Each participant has its own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task.\n", + "\n", + "Below, we segment shards of the Math_10k dataset for **two collaborators**: Portland and Seattle. Each has their own slice of the dataset that's accessible via the `train_dataset` or `eval_dataset` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transferring from collaborator to aggregator, or vice versa." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup participants\n", + "_aggregator = Aggregator()\n", + "_aggregator.private_attributes = {}\n", + "\n", + "# Setup collaborators with private attributes\n", + "collaborator_names = [\n", + " \"Portland\",\n", + " \"Seattle\",\n", + "]\n", + "_collaborators = [Collaborator(name=name) for name in collaborator_names]\n", + "\n", + "for idx, current_collaborator in enumerate(_collaborators):\n", + " # Set the private attributes of the Collaborator to include their specific training and testing data loaders\n", + " current_collaborator.private_attributes = {\n", + " \"train_dataset\": processed_train_dataset.shard(\n", + " num_shards=len(_collaborators), index=idx\n", + " ),\n", + " \"eval_dataset\": processed_test_dataset.shard(\n", + " num_shards=len(_collaborators), index=idx\n", + " ),\n", + " }\n", + "\n", + "local_runtime = LocalRuntime(\n", + " aggregator=_aggregator, collaborators=_collaborators, backend=\"single_process\"\n", + ")\n", + "print(f\"Local runtime collaborators = 
{local_runtime.collaborators}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9cb61fc0", + "metadata": {}, + "source": [ + "## Run Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", + "metadata": {}, + "outputs": [], + "source": [ + "flflow = FederatedFlow(model, rounds=2)\n", + "flflow.runtime = local_runtime\n", + "flflow.run()\n", + "\n", + "# Determine the final model accuracy:\n", + "print(f'\\nFinal aggregated model accuracy for {flflow.rounds} rounds of training: {flflow.aggregated_model_accuracy}')" + ] + }, + { + "cell_type": "markdown", + "id": "7bc8fe27", + "metadata": {}, + "source": [ + "## 🎉 Congratulations! 🎉\n", + "\n", + "Now that you've completed this notebook, check out our [other tutorials](https://github.com/securefederatedai/openfl/tree/develop/openfl-tutorials/experimental/)\n", + "\n", + "- Using the LocalRuntime Ray Backend for dedicated GPU access\n", + "- Vertical Federated Learning\n", + "- Model Watermarking\n", + "- Differential Privacy\n", + "- And More!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 35667d04fee574c828ee182b022c1a60c5f4fb5e Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Mon, 12 May 2025 15:11:01 +0530 Subject: [PATCH 17/34] Update phi-4-quanti.ipynb --- .../workflow/LLM/phi-4-quanti.ipynb | 425 +++--------------- 1 file changed, 59 insertions(+), 366 deletions(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb index 0c6884f384..33281dabb0 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -5,7 +5,7 @@ "id": "a59f475d-d843-46bc-b75e-10984b687ed3", "metadata": {}, "source": [ - "# Federated Fine-Tuning of Phi-4 Using OpenFL" + "# Federated Fine-Tuning of Phi-4 Using OpenFL with 8-bit Quantization" ] }, { @@ -13,10 +13,7 @@ "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", "metadata": {}, "source": [ - "\n", - "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow.\n", - "\n", - "We will fine-tune **Microsoft's [Phi4](https://huggingface.co/microsoft/phi-4)** model using a diverse dataset such as [Math_10k](https://github.com/AGI-Edgerunners/LLM-Adapters/tree/main), an open-source dataset containing mathematical question-answer pairs collected from various smaller math datasets." + "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with 8-bit quantization." 
] }, { @@ -24,34 +21,7 @@ "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", "metadata": {}, "source": [ - "## The Workflow Interface" - ] - }, - { - "cell_type": "markdown", - "id": "e3d74610-e48d-4dd4-b622-eb910fbe91aa", - "metadata": {}, - "source": [ - "The workflow interface is an innovative approach to designing federated learning experiments with OpenFL. It was developed in response to discussions with researchers and users who had unique use cases that didn’t perfectly align with the traditional horizontal federated learning model. This interface enables more flexible compositions of experiments, allowing for greater customization and adaptability in complex, real-world scenarios" - ] - }, - { - "cell_type": "markdown", - "id": "413e1d95-fd76-4fe0-b8d0-4c625c2a8fd3", - "metadata": {}, - "source": [ - "## Installing OpenFL\n", - "To install OpenFL, follow the official documentation: \n", - "[OpenFL Installation Guide](https://openfl.readthedocs.io/en/latest/installation.html)" - ] - }, - { - "cell_type": "markdown", - "id": "53654c70", - "metadata": {}, - "source": [ - "After installation, activate experimental APIs using: \n", - "`fx experimental activate`" + "## Installation" ] }, { @@ -61,8 +31,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Install dependencies \n", - "!pip install torch transformers peft datasets trl==0.12.2 -q" + "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" ] }, { @@ -70,7 +39,7 @@ "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", "metadata": {}, "source": [ - "## Import libraries" + "## Import Libraries" ] }, { @@ -90,7 +59,7 @@ "from datasets import load_dataset\n", "from peft import LoraConfig, get_peft_model\n", "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict\n", - "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n", "from 
transformers.trainer_callback import PrinterCallback\n", "from trl import SFTTrainer\n", "\n", @@ -104,15 +73,7 @@ "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", "metadata": {}, "source": [ - "## Acquiring and preprocessing dataset" - ] - }, - { - "cell_type": "markdown", - "id": "7ba1d8b6-8a5b-41a2-8c77-c9a85e869cda", - "metadata": {}, - "source": [ - "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)" + "## Dataset Preparation" ] }, { @@ -123,41 +84,22 @@ "outputs": [], "source": [ "def file_checksum(file_path, algorithm=\"sha256\"):\n", - " \"\"\"\n", - " Calculate the checksum of a file using the specified hashing algorithm.\n", - "\n", - " Parameters:\n", - " file_path (str): The path to the file for which the checksum is to be calculated.\n", - " algorithm (str): The hashing algorithm to use (default is 'sha256').\n", - "\n", - " Returns:\n", - " str: The calculated checksum of the file.\n", - " \"\"\"\n", " hash_func = hashlib.new(algorithm)\n", " with open(file_path, \"rb\") as f:\n", " for chunk in iter(lambda: f.read(4096), b\"\"):\n", " hash_func.update(chunk)\n", " return hash_func.hexdigest()\n", "\n", - "\n", "if not os.path.exists(\"math_10k.json\"):\n", " r = requests.get(\n", " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n", " )\n", - " with open(\n", - " \"math_10k.json\",\n", - " \"wb\",\n", - " ) as f:\n", + " with open(\"math_10k.json\", \"wb\") as f:\n", " f.write(r.content)\n", "\n", " actual_checksum = file_checksum(\"math_10k.json\")\n", - " if (\n", - " actual_checksum\n", - " != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n", - " ):\n", - " raise ValueError(\n", - " \"Checksum verification failed. 
The file may have been altered.\"\n", - " )\n", + " if actual_checksum != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\":\n", + " raise ValueError(\"Checksum verification failed. The file may have been altered.\")\n", "\n", "raw_dataset = load_dataset(\"json\", data_files=\"math_10k.json\")" ] @@ -167,7 +109,7 @@ "id": "3ab15ad6-db35-4a58-a2d5-54a6d3ccdc78", "metadata": {}, "source": [ - "## Initialize arguments and configurations" + "## Configuration with 8-bit Quantization" ] }, { @@ -177,9 +119,16 @@ "metadata": {}, "outputs": [], "source": [ + "# 8-bit quantization config\n", + "quantization_config = BitsAndBytesConfig(\n", + " load_in_8bit=True,\n", + " llm_int8_threshold=6.0,\n", + " llm_int8_has_fp16_weight=False,\n", + ")\n", + "\n", "training_config = {\n", " \"bf16\": True,\n", - " \"use_cpu\": True,\n", + " \"use_cpu\": False,\n", " \"do_eval\": False,\n", " \"learning_rate\": 5.0e-06,\n", " \"log_level\": \"info\",\n", @@ -196,23 +145,26 @@ " \"gradient_checkpointing\": True,\n", " \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n", " \"warmup_ratio\": 0.2,\n", + " \"optim\": \"adamw_8bit\", # Special 8-bit optimizer\n", "}\n", "\n", "peft_config = {\n", - " \"r\": 1,\n", - " \"lora_alpha\": 2,\n", - " \"lora_dropout\": 0.05,\n", + " \"r\": 8,\n", + " \"lora_alpha\": 16,\n", + " \"lora_dropout\": 0.1,\n", " \"bias\": \"none\",\n", " \"task_type\": \"CAUSAL_LM\",\n", - " \"target_modules\": \"all-linear\",\n", - " \"modules_to_save\": None,\n", + " \"target_modules\": [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", + " \"modules_to_save\": [\"lm_head\", \"embed_tokens\"],\n", "}\n", + "\n", "model_kwargs = dict(\n", " use_cache=False,\n", " trust_remote_code=True,\n", - " torch_dtype=torch.bfloat16,\n", - " device_map=None,\n", + " quantization_config=quantization_config,\n", + " device_map=\"auto\",\n", ")\n", + "\n", "train_conf = 
TrainingArguments(**training_config)\n", "peft_conf = LoraConfig(**peft_config)" ] @@ -222,7 +174,7 @@ "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", "metadata": {}, "source": [ - "## Load and initialize model" + "## Load Quantized Model" ] }, { @@ -232,7 +184,7 @@ "metadata": {}, "outputs": [], "source": [ - "checkpoint_path = \"NyxKrage/Microsoft_Phi-4\"\n", + "checkpoint_path = \"microsoft/phi-1_5\"\n", "model = AutoModelForCausalLM.from_pretrained(\n", " checkpoint_path, return_dict=True, **model_kwargs\n", ")\n", @@ -241,8 +193,8 @@ "tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)\n", "sequence_max_length = 512\n", "val_set_size = 2000\n", - "tokenizer.pad_token_id = 0 # we want this to be different from the eos token\n", - "tokenizer.padding_side = \"left\" # Allow batched inference" + "tokenizer.pad_token_id = 0\n", + "tokenizer.padding_side = \"left\"" ] }, { @@ -250,7 +202,7 @@ "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", "metadata": {}, "source": [ - "## Preprocess dataset" + "## Dataset Preprocessing" ] }, { @@ -261,15 +213,6 @@ "outputs": [], "source": [ "def generate_prompt(data_point):\n", - " \"\"\"\n", - " Generate a prompt based on the given data point.\n", - "\n", - " Parameters:\n", - " data_point (dict): A dictionary containing the instruction, input, and output.\n", - "\n", - " Returns:\n", - " str: The generated prompt as a string.\n", - " \"\"\"\n", " if data_point[\"input\"]:\n", " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. 
\n", "\n", @@ -290,18 +233,7 @@ " ### Response:\n", " {data_point[\"output\"]}\"\"\"\n", "\n", - "\n", "def tokenize(prompt, add_eos_token=True):\n", - " \"\"\"\n", - " Tokenize the given prompt.\n", - "\n", - " Parameters:\n", - " prompt (str): The prompt to be tokenized.\n", - " add_eos_token (bool): Whether to add an end-of-sequence token (default is True).\n", - "\n", - " Returns:\n", - " dict: A dictionary containing the tokenized input IDs and attention mask.\n", - " \"\"\"\n", " result = tokenizer(\n", " prompt,\n", " truncation=True,\n", @@ -318,36 +250,19 @@ " result[\"attention_mask\"].append(1)\n", "\n", " result[\"labels\"] = result[\"input_ids\"].copy()\n", - "\n", " return result\n", "\n", - "\n", "def generate_and_tokenize_prompt(data_point):\n", - " \"\"\"\n", - " Generate and tokenize a prompt based on the given data point.\n", - "\n", - " Parameters:\n", - " data_point (dict): A dictionary containing the instruction, input, and output.\n", - "\n", - " Returns:\n", - " dict: A dictionary containing the tokenized input IDs, attention mask, and labels.\n", - " \"\"\"\n", " full_prompt = generate_prompt(data_point)\n", " tokenized_full_prompt = tokenize(full_prompt)\n", " user_prompt = generate_prompt({**data_point, \"output\": \"\"})\n", " tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)\n", " user_prompt_len = len(tokenized_user_prompt[\"input_ids\"])\n", "\n", - " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\n", - " \"labels\"\n", - " ][user_prompt_len:]\n", + " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\"labels\"][user_prompt_len:]\n", " return tokenized_full_prompt\n", "\n", - "\n", - "train_val = raw_dataset[\"train\"].train_test_split(\n", - " test_size=val_set_size, shuffle=True, seed=42\n", - ")\n", - "\n", + "train_val = raw_dataset[\"train\"].train_test_split(test_size=val_set_size, shuffle=True, seed=42)\n", "processed_train_dataset = 
train_val[\"train\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))\n", "processed_test_dataset = train_val[\"test\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))" ] @@ -357,8 +272,7 @@ "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", "metadata": {}, "source": [ - "## Define Federated Averaging Method\n", - "The FedAvg method is used to average the models from all the collaborators after training." + "## Federated Averaging with Quantization Support" ] }, { @@ -369,17 +283,6 @@ "outputs": [], "source": [ "def FedAvg(peft_params, model, weights=None):\n", - " \"\"\"\n", - " Perform Federated Averaging (FedAvg) on the model parameters.\n", - "\n", - " Parameters:\n", - " peft_params (list): A list of state dictionaries containing the model parameters from different clients.\n", - " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n", - " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n", - "\n", - " Returns:\n", - " torch.nn.Module: The model with the averaged parameters applied.\n", - " \"\"\"\n", " state_dicts = peft_params\n", " state_dict = get_peft_model_state_dict(model)\n", " for key in peft_params[0]:\n", @@ -389,6 +292,8 @@ " [state[key].to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n", " )\n", " ).to(dtype)\n", + " \n", + " # Handle quantization when setting state dict\n", " set_peft_model_state_dict(model, state_dict)\n", " return model" ] @@ -398,9 +303,7 @@ "id": "810eb75e", "metadata": {}, "source": [ - "Now we come to the flow definition. The OpenFL Workflow Interface adopts the conventions set by Metaflow, that every workflow begins with `start` and concludes with the `end` task. The aggregator begins with an optionally passed in model and optimizer. 
The aggregator begins the flow with the `start` task, where the list of collaborators is extracted from the runtime (`self.collaborators = self.runtime.collaborators`) and is then used as the list of participants to run the task listed in `self.next`, `aggregated_model_validation`. The model, optimizer, and anything that is not explicitly excluded from the next function will be passed from the `start` function on the aggregator to the `aggregated_model_validation` task on the collaborator. Where the tasks run is determined by the placement decorator that precedes each task definition (`@aggregator` or `@collaborator`). Once each of the collaborators (defined in the runtime) complete the `aggregated_model_validation` task, they pass their current state onto the `train` task, from `train` to `local_model_validation`, and then finally to `join` at the aggregator. It is in `join` that an average is taken of the model weights, and the next round can begin.\n", - "\n", - "![Workflow Interface](../../../../docs/images/workflow_interface.png)" + "## Federated Learning Workflow with Quantization" ] }, { @@ -412,62 +315,42 @@ "source": [ "class FederatedFlow(FLSpec):\n", " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n", - " \"\"\"\n", - " Initialize the class with the given model, optimizer, and number of rounds.\n", - "\n", - " Parameters:\n", - " model (torch.nn.Module, optional): The model to be used. 
If None, a ValueError is raised.\n", - " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n", - " rounds (int, optional): The number of rounds for training or processing (default is 3).\n", - " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n", - "\n", - " Raises:\n", - " ValueError: If no model is provided.\n", - " \"\"\"\n", " super().__init__(**kwargs)\n", " if model is not None:\n", " self.model = model\n", " self.peft_params = get_peft_model_state_dict(self.model)\n", " self.optimizer = optimizer\n", + " \n", + " # Print memory usage\n", + " print(f\"Model memory footprint: {self.model.get_memory_footprint() / 1024**2:.2f} MB\")\n", + " \n", + " # Print trainable parameters\n", + " trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)\n", + " total_params = sum(p.numel() for p in self.model.parameters())\n", + " print(f\"Trainable params: {trainable_params} || All params: {total_params} || Trainable%: {100 * trainable_params / total_params:.2f}\")\n", " else:\n", " raise ValueError(\"No model inputted\")\n", "\n", " self.rounds = rounds\n", - " \n", "\n", " @aggregator\n", " def start(self):\n", - " \"\"\"\n", - " Initialize the model and set up the collaborators for federated learning.\n", - "\n", - " This method performs the initial setup for the model, including setting the\n", - " collaborators, initializing private variables, and starting the first round\n", - " of the federated learning process.\n", - " \"\"\"\n", " print(f\"Performing initialization for model\")\n", " self.collaborators = self.runtime.collaborators\n", " self.current_round = 0\n", - " self.next(\n", - " self.aggregated_model_validation,\n", - " foreach=\"collaborators\",\n", - " )\n", + " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", "\n", - " \n", " @collaborator\n", " def aggregated_model_validation(self):\n", - " \"\"\"\n", - " Perform aggregated model validation for 
a collaborator.\n", - "\n", - " This method loads the model, applies the PEFT configuration, and evaluates\n", - " the model using the provided training and evaluation datasets. The validation\n", - " score is then stored and the next step in the process is triggered.\n", - " \"\"\"\n", " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n", + " \n", + " # Load model with 8-bit quantization\n", " self.model = AutoModelForCausalLM.from_pretrained(\n", " checkpoint_path, return_dict=True, **model_kwargs\n", " )\n", " self.model = get_peft_model(self.model, peft_conf)\n", " set_peft_model_state_dict(self.model, self.peft_params)\n", + " \n", " trainer = SFTTrainer(\n", " model=self.model,\n", " args=train_conf,\n", @@ -491,13 +374,12 @@ "\n", " @collaborator\n", " def train(self):\n", - " \"\"\"\n", - " Train the model for a collaborator.\n", - "\n", - " This method trains the model using the provided training and evaluation datasets.\n", - " The training loss is stored, the model is saved, and the next step in the process\n", - " is triggered.\n", - " \"\"\"\n", + " print(f\"Training on collaborator {self.input}\")\n", + " \n", + " # Enable gradient checkpointing for memory efficiency\n", + " self.model.gradient_checkpointing_enable()\n", + " self.model.config.use_cache = False\n", + " \n", " trainer = SFTTrainer(\n", " model=self.model,\n", " args=train_conf,\n", @@ -513,193 +395,4 @@ " ),\n", " )\n", "\n", - " out = trainer.train()\n", - " self.loss = out.training_loss\n", - " trainer.save_model()\n", - " self.training_completed = True\n", - " self.next(self.local_model_validation)\n", - "\n", - " @collaborator\n", - " def local_model_validation(self):\n", - " \"\"\"\n", - " Perform local model validation for a collaborator.\n", - "\n", - " This method evaluates the model using the provided training and evaluation datasets.\n", - " The validation score is stored, the PEFT parameters are updated, and the next step\n", - " in the 
process is triggered.\n", - " \"\"\"\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=train_conf,\n", - " peft_config=peft_conf,\n", - " train_dataset=processed_train_dataset,\n", - " eval_dataset=processed_test_dataset,\n", - " max_seq_length=sequence_max_length,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", - " out = trainer.evaluate()\n", - " self.local_validation_score = out[\"eval_loss\"]\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " print(f\"Doing local model validation for collaborator {self.input}\")\n", - " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n", - "\n", - " @aggregator\n", - " def join(self, inputs):\n", - " \"\"\"\n", - " Aggregate the results from all collaborators and update the model.\n", - "\n", - " This method calculates the average loss, aggregated model accuracy, and local model\n", - " accuracy from all collaborators. 
The model parameters are updated using Federated\n", - " Averaging (FedAvg), and the next round of the process is triggered if applicable.\n", - " \"\"\"\n", - " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", - " self.aggregated_model_accuracy = sum(\n", - " input.agg_validation_score for input in inputs\n", - " ) / len(inputs)\n", - " self.local_model_accuracy = sum(\n", - " input.local_validation_score for input in inputs\n", - " ) / len(inputs)\n", - " print(\n", - " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n", - " )\n", - " print(f\"Average training loss = {self.average_loss}\")\n", - " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n", - "\n", - " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - "\n", - " self.model.save_pretrained(\"./aggregated/model\")\n", - " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n", - " self.current_round += 1\n", - " if self.current_round < self.rounds:\n", - " self.next(\n", - " self.aggregated_model_validation,\n", - " foreach=\"collaborators\",\n", - " exclude=[\"model\"],\n", - " )\n", - " else:\n", - " self.next(self.end)\n", - "\n", - " @aggregator\n", - " def end(self):\n", - " \"\"\"\n", - " End the federated learning process.\n", - "\n", - " This method marks the end of the federated learning process and performs any\n", - " necessary cleanup or finalization steps.\n", - " \"\"\"\n", - " print(f\"This is the end of the flow\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", - "metadata": {}, - "source": [ - "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_dataset` and `eval_dataset` for each of the collaborators. 
These are **private_attributes** that are exposed only through the runtime. Each participant has its own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task.\n", - "\n", - "Below, we segment shards of the Math_10k dataset for **two collaborators**: Portland and Seattle. Each has their own slice of the dataset that's accessible via the `train_dataset` or `eval_dataset` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transferring from collaborator to aggregator, or vice versa." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", - "metadata": {}, - "outputs": [], - "source": [ - "# Setup participants\n", - "_aggregator = Aggregator()\n", - "_aggregator.private_attributes = {}\n", - "\n", - "# Setup collaborators with private attributes\n", - "collaborator_names = [\n", - " \"Portland\",\n", - " \"Seattle\",\n", - "]\n", - "_collaborators = [Collaborator(name=name) for name in collaborator_names]\n", - "\n", - "for idx, current_collaborator in enumerate(_collaborators):\n", - " # Set the private attributes of the Collaborator to include their specific training and testing data loaders\n", - " current_collaborator.private_attributes = {\n", - " \"train_dataset\": processed_train_dataset.shard(\n", - " num_shards=len(_collaborators), index=idx\n", - " ),\n", - " \"eval_dataset\": processed_test_dataset.shard(\n", - " num_shards=len(_collaborators), index=idx\n", - " ),\n", - " }\n", - "\n", - "local_runtime = LocalRuntime(\n", - " aggregator=_aggregator, collaborators=_collaborators, backend=\"single_process\"\n", - ")\n", - "print(f\"Local runtime collaborators = 
{local_runtime.collaborators}\")" - ] - }, - { - "cell_type": "markdown", - "id": "9cb61fc0", - "metadata": {}, - "source": [ - "## Run Experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", - "metadata": {}, - "outputs": [], - "source": [ - "flflow = FederatedFlow(model, rounds=2)\n", - "flflow.runtime = local_runtime\n", - "flflow.run()\n", - "\n", - "# Determine the final model accuracy:\n", - "print(f'\\nFinal aggregated model accuracy for {flflow.rounds} rounds of training: {flflow.aggregated_model_accuracy}')" - ] - }, - { - "cell_type": "markdown", - "id": "7bc8fe27", - "metadata": {}, - "source": [ - "## 🎉 Congratulations! 🎉\n", - "\n", - "Now that you've completed this notebook, check out our [other tutorials](https://github.com/securefederatedai/openfl/tree/develop/openfl-tutorials/experimental/)\n", - "\n", - "- Using the LocalRuntime Ray Backend for dedicated GPU access\n", - "- Vertical Federated Learning\n", - "- Model Watermarking\n", - "- Differential Privacy\n", - "- And More!" 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + " out From 633c27dbbe2cd0e1076641f7000ca9f5f81e2d2b Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Mon, 12 May 2025 15:11:17 +0530 Subject: [PATCH 18/34] Update phi-4-quanti.ipynb --- openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb index 33281dabb0..960e79d1a5 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -395,4 +395,4 @@ " ),\n", " )\n", "\n", - " out + " From c7a9b42ecc2a4ee6b2d70a2fd9c8fc04aca2121a Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Mon, 12 May 2025 15:11:50 +0530 Subject: [PATCH 19/34] Update phi-4-quanti.ipynb --- .../workflow/LLM/phi-4-quanti.ipynb | 100 +----------------- 1 file changed, 1 insertion(+), 99 deletions(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb index 960e79d1a5..c76ecd5f51 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -297,102 +297,4 @@ " set_peft_model_state_dict(model, state_dict)\n", " return model" ] - }, - { - "cell_type": "markdown", - "id": "810eb75e", - "metadata": {}, 
- "source": [ - "## Federated Learning Workflow with Quantization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58298e8e-ab9e-4377-966e-143823441697", - "metadata": {}, - "outputs": [], - "source": [ - "class FederatedFlow(FLSpec):\n", - " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n", - " super().__init__(**kwargs)\n", - " if model is not None:\n", - " self.model = model\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " self.optimizer = optimizer\n", - " \n", - " # Print memory usage\n", - " print(f\"Model memory footprint: {self.model.get_memory_footprint() / 1024**2:.2f} MB\")\n", - " \n", - " # Print trainable parameters\n", - " trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)\n", - " total_params = sum(p.numel() for p in self.model.parameters())\n", - " print(f\"Trainable params: {trainable_params} || All params: {total_params} || Trainable%: {100 * trainable_params / total_params:.2f}\")\n", - " else:\n", - " raise ValueError(\"No model inputted\")\n", - "\n", - " self.rounds = rounds\n", - "\n", - " @aggregator\n", - " def start(self):\n", - " print(f\"Performing initialization for model\")\n", - " self.collaborators = self.runtime.collaborators\n", - " self.current_round = 0\n", - " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", - "\n", - " @collaborator\n", - " def aggregated_model_validation(self):\n", - " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n", - " \n", - " # Load model with 8-bit quantization\n", - " self.model = AutoModelForCausalLM.from_pretrained(\n", - " checkpoint_path, return_dict=True, **model_kwargs\n", - " )\n", - " self.model = get_peft_model(self.model, peft_conf)\n", - " set_peft_model_state_dict(self.model, self.peft_params)\n", - " \n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=train_conf,\n", - " peft_config=peft_conf,\n", - " 
train_dataset=self.train_dataset,\n", - " eval_dataset=self.eval_dataset,\n", - " max_seq_length=sequence_max_length,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", - "\n", - " trainer.remove_callback(PrinterCallback)\n", - " out = trainer.evaluate()\n", - " self.agg_validation_score = out[\"eval_loss\"]\n", - " print(f\"{self.input} value of {self.agg_validation_score}\")\n", - " self.next(self.train)\n", - "\n", - " @collaborator\n", - " def train(self):\n", - " print(f\"Training on collaborator {self.input}\")\n", - " \n", - " # Enable gradient checkpointing for memory efficiency\n", - " self.model.gradient_checkpointing_enable()\n", - " self.model.config.use_cache = False\n", - " \n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=train_conf,\n", - " peft_config=peft_conf,\n", - " train_dataset=self.train_dataset,\n", - " eval_dataset=self.eval_dataset,\n", - " max_seq_length=sequence_max_length,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", - "\n", - " + } From 4067c82e8f5f5f486010d51956434e1f5e607c3d Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Mon, 12 May 2025 15:31:03 +0530 Subject: [PATCH 20/34] Update phi-4-quanti.ipynb --- .../workflow/LLM/phi-4-quanti.ipynb | 405 +++++++++--------- 1 file changed, 205 insertions(+), 200 deletions(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb index c76ecd5f51..8435e9e418 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb 
+++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -5,7 +5,7 @@ "id": "a59f475d-d843-46bc-b75e-10984b687ed3", "metadata": {}, "source": [ - "# Federated Fine-Tuning of Phi-4 Using OpenFL with 8-bit Quantization" + "# Federated Fine-Tuning of Phi-4 with 8-bit Quantization" ] }, { @@ -13,15 +13,7 @@ "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", "metadata": {}, "source": [ - "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with 8-bit quantization." - ] - }, - { - "cell_type": "markdown", - "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", - "metadata": {}, - "source": [ - "## Installation" + "This notebook demonstrates federated fine-tuning of Microsoft's Phi-4 model with 8-bit quantization using OpenFL." ] }, { @@ -34,14 +26,6 @@ "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" ] }, - { - "cell_type": "markdown", - "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", - "metadata": {}, - "source": [ - "## Import Libraries" - ] - }, { "cell_type": "code", "execution_count": null, @@ -49,33 +33,23 @@ "metadata": {}, "outputs": [], "source": [ - "import hashlib\n", - "import os\n", - "\n", - "import numpy as np\n", - "import requests\n", "import torch\n", "import transformers\n", - "from datasets import load_dataset\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " BitsAndBytesConfig,\n", + " TrainingArguments\n", + ")\n", "from peft import LoraConfig, get_peft_model\n", - "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict\n", - "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n", - "from transformers.trainer_callback import PrinterCallback\n", + "from datasets import load_dataset\n", "from trl import SFTTrainer\n", - "\n", + "import numpy as np\n", "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n", "from 
openfl.experimental.workflow.placement import aggregator, collaborator\n", "from openfl.experimental.workflow.runtime import LocalRuntime" ] }, - { - "cell_type": "markdown", - "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", - "metadata": {}, - "source": [ - "## Dataset Preparation" - ] - }, { "cell_type": "code", "execution_count": null, @@ -83,218 +57,249 @@ "metadata": {}, "outputs": [], "source": [ - "def file_checksum(file_path, algorithm=\"sha256\"):\n", - " hash_func = hashlib.new(algorithm)\n", - " with open(file_path, \"rb\") as f:\n", - " for chunk in iter(lambda: f.read(4096), b\"\"):\n", - " hash_func.update(chunk)\n", - " return hash_func.hexdigest()\n", + "# 8-bit quantization config\n", + "quant_config = BitsAndBytesConfig(\n", + " load_in_8bit=True,\n", + " llm_int8_threshold=6.0,\n", + " llm_int8_skip_modules=None,\n", + " llm_int8_enable_fp32_cpu_offload=False,\n", + " llm_int8_has_fp16_weight=False\n", + ")\n", "\n", - "if not os.path.exists(\"math_10k.json\"):\n", - " r = requests.get(\n", - " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n", - " )\n", - " with open(\"math_10k.json\", \"wb\") as f:\n", - " f.write(r.content)\n", + "# Model config\n", + "model_kwargs = {\n", + " \"quantization_config\": quant_config,\n", + " \"device_map\": \"auto\",\n", + " \"trust_remote_code\": True\n", + "}\n", "\n", - " actual_checksum = file_checksum(\"math_10k.json\")\n", - " if actual_checksum != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\":\n", - " raise ValueError(\"Checksum verification failed. 
The file may have been altered.\")\n", + "# PEFT config\n", + "peft_config = LoraConfig(\n", + " r=8,\n", + " lora_alpha=16,\n", + " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n", + " lora_dropout=0.05,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\"\n", + ")\n", "\n", - "raw_dataset = load_dataset(\"json\", data_files=\"math_10k.json\")" + "# Training config\n", + "training_config = TrainingArguments(\n", + " output_dir=\"./results\",\n", + " per_device_train_batch_size=2,\n", + " per_device_eval_batch_size=2,\n", + " gradient_accumulation_steps=4,\n", + " learning_rate=2e-5,\n", + " logging_steps=10,\n", + " num_train_epochs=1,\n", + " max_grad_norm=0.3,\n", + " warmup_ratio=0.03,\n", + " lr_scheduler_type=\"cosine\",\n", + " save_steps=100,\n", + " fp16=True,\n", + " optim=\"adamw_torch\",\n", + " report_to=\"none\"\n", + ")" ] }, { - "cell_type": "markdown", - "id": "3ab15ad6-db35-4a58-a2d5-54a6d3ccdc78", + "cell_type": "code", + "execution_count": null, + "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", "metadata": {}, + "outputs": [], "source": [ - "## Configuration with 8-bit Quantization" + "# Load model and tokenizer\n", + "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-2\", **model_kwargs)\n", + "model = get_peft_model(model, peft_config)\n", + "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-2\")\n", + "tokenizer.pad_token = tokenizer.eos_token" ] }, { "cell_type": "code", "execution_count": null, - "id": "eada9809-468a-47c6-9b03-55aa887c9487", + "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", "metadata": {}, "outputs": [], "source": [ - "# 8-bit quantization config\n", - "quantization_config = BitsAndBytesConfig(\n", - " load_in_8bit=True,\n", - " llm_int8_threshold=6.0,\n", - " llm_int8_has_fp16_weight=False,\n", - ")\n", + "# Dataset preparation\n", + "def format_instruction(sample):\n", + " return f\"\"\"### Instruction:\n", + "{sample['instruction']}\n", "\n", - "training_config = {\n", - " \"bf16\": 
True,\n", - " \"use_cpu\": False,\n", - " \"do_eval\": False,\n", - " \"learning_rate\": 5.0e-06,\n", - " \"log_level\": \"info\",\n", - " \"logging_steps\": 20,\n", - " \"lr_scheduler_type\": \"cosine\",\n", - " \"num_train_epochs\": 1,\n", - " \"output_dir\": \"./checkpoint_dir\",\n", - " \"overwrite_output_dir\": True,\n", - " \"per_device_eval_batch_size\": 1,\n", - " \"per_device_train_batch_size\": 1,\n", - " \"save_steps\": 100,\n", - " \"save_total_limit\": 1,\n", - " \"seed\": 0,\n", - " \"gradient_checkpointing\": True,\n", - " \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n", - " \"warmup_ratio\": 0.2,\n", - " \"optim\": \"adamw_8bit\", # Special 8-bit optimizer\n", - "}\n", + "### Input:\n", + "{sample['input']}\n", "\n", - "peft_config = {\n", - " \"r\": 8,\n", - " \"lora_alpha\": 16,\n", - " \"lora_dropout\": 0.1,\n", - " \"bias\": \"none\",\n", - " \"task_type\": \"CAUSAL_LM\",\n", - " \"target_modules\": [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", - " \"modules_to_save\": [\"lm_head\", \"embed_tokens\"],\n", - "}\n", + "### Response:\n", + "{sample['output']}\"\"\"\n", "\n", - "model_kwargs = dict(\n", - " use_cache=False,\n", - " trust_remote_code=True,\n", - " quantization_config=quantization_config,\n", - " device_map=\"auto\",\n", - ")\n", + "dataset = load_dataset(\"json\", data_files=\"math_10k.json\")[\"train\"].train_test_split(test_size=0.1)\n", + "train_data = dataset[\"train\"].shuffle().select(range(100))\n", + "val_data = dataset[\"test\"].shuffle().select(range(20))\n", "\n", - "train_conf = TrainingArguments(**training_config)\n", - "peft_conf = LoraConfig(**peft_config)" + "train_data = train_data.map(lambda x: {\"text\": format_instruction(x)})\n", + "val_data = val_data.map(lambda x: {\"text\": format_instruction(x)})" ] }, { - "cell_type": "markdown", - "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", + "cell_type": "code", + "execution_count": null, + "id": 
"6dc85c57-68b2-4514-9373-43e3d7c05c10", "metadata": {}, + "outputs": [], "source": [ - "## Load Quantized Model" + "class FederatedFlow(FLSpec):\n", + " def __init__(self, model=None, rounds=3, **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.model = model\n", + " self.rounds = rounds\n", + " self.training_metrics = []\n", + " \n", + " @aggregator\n", + " def start(self):\n", + " print(\"Starting federated training\")\n", + " self.collaborators = self.runtime.collaborators\n", + " self.current_round = 0\n", + " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", + " \n", + " @collaborator\n", + " def aggregated_model_validation(self):\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_config,\n", + " train_dataset=self.train_data,\n", + " eval_dataset=self.val_data,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=512,\n", + " tokenizer=tokenizer\n", + " )\n", + " metrics = trainer.evaluate()\n", + " self.validation_loss = metrics[\"eval_loss\"]\n", + " self.next(self.train)\n", + " \n", + " @collaborator\n", + " def train(self):\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_config,\n", + " train_dataset=self.train_data,\n", + " eval_dataset=self.val_data,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=512,\n", + " tokenizer=tokenizer\n", + " )\n", + " train_result = trainer.train()\n", + " self.training_loss = train_result.training_loss\n", + " self.training_metrics.append({\n", + " \"round\": self.current_round,\n", + " \"loss\": self.training_loss,\n", + " \"collaborator\": self.input\n", + " })\n", + " self.next(self.local_model_validation)\n", + " \n", + " @collaborator\n", + " def local_model_validation(self):\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_config,\n", + " train_dataset=self.train_data,\n", + " eval_dataset=self.val_data,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=512,\n", + " 
tokenizer=tokenizer\n", + " )\n", + " metrics = trainer.evaluate()\n", + " self.local_validation_loss = metrics[\"eval_loss\"]\n", + " self.next(self.join, exclude=[\"model\"])\n", + " \n", + " @aggregator\n", + " def join(self, inputs):\n", + " avg_loss = sum(input.training_loss for input in inputs) / len(inputs)\n", + " avg_val_loss = sum(input.validation_loss for input in inputs) / len(inputs)\n", + " \n", + " print(f\"Round {self.current_round} - Avg Training Loss: {avg_loss:.4f}\")\n", + " print(f\"Round {self.current_round} - Avg Validation Loss: {avg_val_loss:.4f}\")\n", + " \n", + " # Aggregate model updates\n", + " self.current_round += 1\n", + " if self.current_round < self.rounds:\n", + " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", + " else:\n", + " self.next(self.end)\n", + " \n", + " @aggregator\n", + " def end(self):\n", + " print(\"Training complete!\")\n", + " print(\"Final Training Metrics:\")\n", + " for metric in self.training_metrics:\n", + " print(f\"Round {metric['round']} - {metric['collaborator']} - Loss: {metric['loss']:.4f}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", + "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", "metadata": {}, "outputs": [], "source": [ - "checkpoint_path = \"microsoft/phi-1_5\"\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " checkpoint_path, return_dict=True, **model_kwargs\n", - ")\n", - "model = get_peft_model(model, peft_conf)\n", + "# Setup runtime\n", + "aggregator = Aggregator()\n", + "collaborators = [\n", + " Collaborator(name=\"Portland\", private_attributes={\"train_data\": train_data.shard(2, 0), \"val_data\": val_data.shard(2, 0)}),\n", + " Collaborator(name=\"Seattle\", private_attributes={\"train_data\": train_data.shard(2, 1), \"val_data\": val_data.shard(2, 1)})\n", + "]\n", "\n", - "tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)\n", - "sequence_max_length = 512\n", - "val_set_size = 
2000\n", - "tokenizer.pad_token_id = 0\n", - "tokenizer.padding_side = \"left\"" - ] - }, - { - "cell_type": "markdown", - "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", - "metadata": {}, - "source": [ - "## Dataset Preprocessing" + "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend=\"single_process\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", + "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", "metadata": {}, "outputs": [], "source": [ - "def generate_prompt(data_point):\n", - " if data_point[\"input\"]:\n", - " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. \n", - "\n", - " ### Instruction:\n", - " {data_point[\"instruction\"]}\n", - " \n", - " ### Input:\n", - " {data_point[\"input\"]}\n", - " \n", - " ### Response:\n", - " {data_point[\"output\"]}\"\"\"\n", - " else:\n", - " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request. 
\n", - "\n", - " ### Instruction:\n", - " {data_point[\"instruction\"]}\n", - " \n", - " ### Response:\n", - " {data_point[\"output\"]}\"\"\"\n", - "\n", - "def tokenize(prompt, add_eos_token=True):\n", - " result = tokenizer(\n", - " prompt,\n", - " truncation=True,\n", - " max_length=sequence_max_length,\n", - " padding=False,\n", - " return_tensors=None,\n", - " )\n", - " if (\n", - " result[\"input_ids\"][-1] != tokenizer.eos_token_id\n", - " and len(result[\"input_ids\"]) < sequence_max_length\n", - " and add_eos_token\n", - " ):\n", - " result[\"input_ids\"].append(tokenizer.eos_token_id)\n", - " result[\"attention_mask\"].append(1)\n", - "\n", - " result[\"labels\"] = result[\"input_ids\"].copy()\n", - " return result\n", - "\n", - "def generate_and_tokenize_prompt(data_point):\n", - " full_prompt = generate_prompt(data_point)\n", - " tokenized_full_prompt = tokenize(full_prompt)\n", - " user_prompt = generate_prompt({**data_point, \"output\": \"\"})\n", - " tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)\n", - " user_prompt_len = len(tokenized_user_prompt[\"input_ids\"])\n", - "\n", - " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\"labels\"][user_prompt_len:]\n", - " return tokenized_full_prompt\n", - "\n", - "train_val = raw_dataset[\"train\"].train_test_split(test_size=val_set_size, shuffle=True, seed=42)\n", - "processed_train_dataset = train_val[\"train\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))\n", - "processed_test_dataset = train_val[\"test\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))" + "# Run training\n", + "flow = FederatedFlow(model, rounds=2)\n", + "flow.runtime = runtime\n", + "flow.run()" ] }, { "cell_type": "markdown", - "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", + "id": "7bc8fe27", "metadata": {}, "source": [ - "## Federated Averaging with Quantization Support" + "## Key Features:\n", + "\n", + "1. 
**8-bit Quantization**: Enabled through BitsAndBytesConfig\n", + "2. **Enhanced Training Metrics**: Tracks and reports loss at each round\n", + "3. **PEFT with LoRA**: Parameter-efficient fine-tuning configuration\n", + "4. **Memory Optimization**: 8-bit weights and gradient accumulation\n", + "5. **Validation Tracking**: Separate validation before and after training" ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", - "metadata": {}, - "outputs": [], - "source": [ - "def FedAvg(peft_params, model, weights=None):\n", - " state_dicts = peft_params\n", - " state_dict = get_peft_model_state_dict(model)\n", - " for key in peft_params[0]:\n", - " dtype = state_dicts[0][key].dtype\n", - " state_dict[key] = torch.from_numpy(\n", - " np.average(\n", - " [state[key].to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n", - " )\n", - " ).to(dtype)\n", - " \n", - " # Handle quantization when setting state dict\n", - " set_peft_model_state_dict(model, state_dict)\n", - " return model" - ] + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a3b71a5b57dac5083efed6846f5e34e260faf9aa Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Mon, 12 May 2025 15:31:56 +0530 Subject: [PATCH 21/34] Update phi-4-quanti.ipynb --- openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb 
b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb index 8435e9e418..7a53e6fceb 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -110,9 +110,9 @@ "outputs": [], "source": [ "# Load model and tokenizer\n", - "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-2\", **model_kwargs)\n", + "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-4\", **model_kwargs)\n", "model = get_peft_model(model, peft_config)\n", - "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-2\")\n", + "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-4\")\n", "tokenizer.pad_token = tokenizer.eos_token" ] }, From 0380c355235cb7d55fca1380b73232888ab7f4cf Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Mon, 12 May 2025 15:52:08 +0530 Subject: [PATCH 22/34] Update phi-4-quanti.ipynb --- .../workflow/LLM/phi-4-quanti.ipynb | 63 ++++++++----------- 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb index 7a53e6fceb..d7cc2b8616 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -13,7 +13,7 @@ "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", "metadata": {}, "source": [ - "This notebook demonstrates federated fine-tuning of Microsoft's Phi-4 model with 8-bit quantization using OpenFL." + "This notebook demonstrates federated fine-tuning of Microsoft's Phi-4 model (4B parameters) with 8-bit quantization using OpenFL." 
] }, { @@ -57,7 +57,7 @@ "metadata": {}, "outputs": [], "source": [ - "# 8-bit quantization config\n", + "# 8-bit quantization config for Phi-4\n", "quant_config = BitsAndBytesConfig(\n", " load_in_8bit=True,\n", " llm_int8_threshold=6.0,\n", @@ -66,37 +66,38 @@ " llm_int8_has_fp16_weight=False\n", ")\n", "\n", - "# Model config\n", + "# Model config for Phi-4\n", "model_kwargs = {\n", " \"quantization_config\": quant_config,\n", " \"device_map\": \"auto\",\n", - " \"trust_remote_code\": True\n", + " \"trust_remote_code\": True,\n", + " \"torch_dtype\": torch.bfloat16\n", "}\n", "\n", - "# PEFT config\n", + "# PEFT config optimized for Phi-4\n", "peft_config = LoraConfig(\n", - " r=8,\n", - " lora_alpha=16,\n", - " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n", + " r=16, # Higher rank for larger model\n", + " lora_alpha=32,\n", + " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", " lora_dropout=0.05,\n", " bias=\"none\",\n", " task_type=\"CAUSAL_LM\"\n", ")\n", "\n", - "# Training config\n", + "# Training config adjusted for Phi-4\n", "training_config = TrainingArguments(\n", " output_dir=\"./results\",\n", - " per_device_train_batch_size=2,\n", - " per_device_eval_batch_size=2,\n", - " gradient_accumulation_steps=4,\n", - " learning_rate=2e-5,\n", + " per_device_train_batch_size=1, # Reduced for 4B model\n", + " per_device_eval_batch_size=1,\n", + " gradient_accumulation_steps=8, # Increased for memory efficiency\n", + " learning_rate=1e-5, # Lower learning rate for larger model\n", " logging_steps=10,\n", " num_train_epochs=1,\n", " max_grad_norm=0.3,\n", " warmup_ratio=0.03,\n", " lr_scheduler_type=\"cosine\",\n", " save_steps=100,\n", - " fp16=True,\n", + " bf16=True, # Using bfloat16 for Phi-4\n", " optim=\"adamw_torch\",\n", " report_to=\"none\"\n", ")" @@ -109,11 +110,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Load model and tokenizer\n", + "# Load Phi-4 model and 
tokenizer\n", "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-4\", **model_kwargs)\n", "model = get_peft_model(model, peft_config)\n", "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-4\")\n", - "tokenizer.pad_token = tokenizer.eos_token" + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"left\"" ] }, { @@ -123,7 +125,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Dataset preparation\n", + "# Dataset preparation for Phi-4\n", "def format_instruction(sample):\n", " return f\"\"\"### Instruction:\n", "{sample['instruction']}\n", @@ -135,8 +137,8 @@ "{sample['output']}\"\"\"\n", "\n", "dataset = load_dataset(\"json\", data_files=\"math_10k.json\")[\"train\"].train_test_split(test_size=0.1)\n", - "train_data = dataset[\"train\"].shuffle().select(range(100))\n", - "val_data = dataset[\"test\"].shuffle().select(range(20))\n", + "train_data = dataset[\"train\"].shuffle().select(range(50)) # Smaller subset for Phi-4\n", + "val_data = dataset[\"test\"].shuffle().select(range(10))\n", "\n", "train_data = train_data.map(lambda x: {\"text\": format_instruction(x)})\n", "val_data = val_data.map(lambda x: {\"text\": format_instruction(x)})" @@ -158,7 +160,7 @@ " \n", " @aggregator\n", " def start(self):\n", - " print(\"Starting federated training\")\n", + " print(\"Starting federated training for Phi-4\")\n", " self.collaborators = self.runtime.collaborators\n", " self.current_round = 0\n", " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", @@ -230,7 +232,7 @@ " \n", " @aggregator\n", " def end(self):\n", - " print(\"Training complete!\")\n", + " print(\"Phi-4 Training complete!\")\n", " print(\"Final Training Metrics:\")\n", " for metric in self.training_metrics:\n", " print(f\"Round {metric['round']} - {metric['collaborator']} - Loss: {metric['loss']:.4f}\")" @@ -243,7 +245,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Setup runtime\n", + "# Setup runtime for Phi-4\n", "aggregator = 
Aggregator()\n", "collaborators = [\n", " Collaborator(name=\"Portland\", private_attributes={\"train_data\": train_data.shard(2, 0), \"val_data\": val_data.shard(2, 0)}),\n", @@ -260,26 +262,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Run training\n", + "# Run training for Phi-4\n", "flow = FederatedFlow(model, rounds=2)\n", "flow.runtime = runtime\n", "flow.run()" ] }, - { - "cell_type": "markdown", - "id": "7bc8fe27", - "metadata": {}, - "source": [ - "## Key Features:\n", - "\n", - "1. **8-bit Quantization**: Enabled through BitsAndBytesConfig\n", - "2. **Enhanced Training Metrics**: Tracks and reports loss at each round\n", - "3. **PEFT with LoRA**: Parameter-efficient fine-tuning configuration\n", - "4. **Memory Optimization**: 8-bit weights and gradient accumulation\n", - "5. **Validation Tracking**: Separate validation before and after training" - ] - } + ], "metadata": { "kernelspec": { From 0335de7ae96f6441f3129e38e79078028091171b Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Mon, 12 May 2025 15:52:51 +0530 Subject: [PATCH 23/34] Update phi-4-quanti.ipynb --- .../workflow/LLM/phi-4-quanti.ipynb | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb index d7cc2b8616..9780a0d63a 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -268,26 +268,27 @@ "flow.run()" ] }, - - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 
"3.10.8" - } + { + "cell_type": "markdown", + "id": "7bc8fe27", + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } }, "nbformat": 4, "nbformat_minor": 5 From 7bb70cfc478d6d9655a6ec5ad4002d7babfb2bed Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Mon, 12 May 2025 15:53:13 +0530 Subject: [PATCH 24/34] Update phi-4-quanti.ipynb --- .../workflow/LLM/phi-4-quanti.ipynb | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb index 9780a0d63a..59ae25d98a 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb @@ -271,24 +271,29 @@ { "cell_type": "markdown", "id": "7bc8fe27", - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } + "metadata": {}, + "source": [ + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.10.8" + } }, "nbformat": 4, "nbformat_minor": 5 From 02f194248a410fac9cda6668b689dccdcb3f4a30 Mon Sep 17 00:00:00 2001 From: rajithkrishnegowda <134698520+rajithkrishnegowda@users.noreply.github.com> Date: Thu, 15 May 2025 18:02:27 +0530 Subject: [PATCH 25/34] Add files via upload --- .../experimental/workflow/LLM/phi-4-sol.ipynb | 438 ++++++++++++++++++ 1 file changed, 438 insertions(+) create mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb new file mode 100644 index 0000000000..b1ca1bac6e --- /dev/null +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a59f475d-d843-46bc-b75e-10984b687ed3", + "metadata": {}, + "source": [ + "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization" + ] + }, + { + "cell_type": "markdown", + "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", + "metadata": {}, + "source": [ + "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n", + "- Parameter-Efficient Fine-Tuning (PEFT)\n", + "- 4-bit Quantization (QLoRA)\n", + "- Gradient Checkpointing\n", + "- Optimized Training Configuration" + ] + }, + { + "cell_type": "markdown", + "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" + ] + }, + { + "cell_type": "markdown", + "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", + "metadata": {}, + "source": [ + "## Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "be4690ae-0671-4d3a-8f21-620ab865a03e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " BitsAndBytesConfig,\n", + " TrainingArguments\n", + ")\n", + "from peft import (\n", + " LoraConfig,\n", + " get_peft_model,\n", + " prepare_model_for_kbit_training,\n", + " PeftModel\n", + ")\n", + "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # Added this import\n", + "from datasets import load_dataset\n", + "from trl import SFTTrainer\n", + "from openfl.experimental.workflow import FLSpec, Aggregator, Collaborator, LocalRuntime\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eada9809-468a-47c6-9b03-55aa887c9487", + "metadata": {}, + "outputs": [], + "source": [ + "# Model and dataset\n", + "model_name = \"microsoft/phi-4\"\n", + "dataset_name = \"math_10k.json\"\n", + "\n", + "# QLoRA configuration\n", + "bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_use_double_quant=True,\n", + ")\n", + "\n", + "# LoRA configuration\n", + "peft_config = LoraConfig(\n", + " r=16, # Increased from original for better adaptation\n", + " lora_alpha=32,\n", + " lora_dropout=0.05,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"dense\"],\n", + ")\n", + "\n", + "# Training configuration\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./results\",\n", + " num_train_epochs=1,\n", + " per_device_train_batch_size=1, # Reduced for Phi-4\n", + " gradient_accumulation_steps=2,\n", + " optim=\"paged_adamw_32bit\",\n", + " save_steps=100,\n", + " 
logging_steps=10,\n", + " learning_rate=2e-4,\n", + " weight_decay=0.001,\n", + " fp16=False,\n", + " bf16=True,\n", + " max_grad_norm=0.3,\n", + " warmup_ratio=0.03,\n", + " lr_scheduler_type=\"cosine\",\n", + " gradient_checkpointing=True,\n", + " report_to=\"none\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", + "metadata": {}, + "source": [ + "## Load and Prepare Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", + "metadata": {}, + "outputs": [], + "source": [ + "# Load tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\"\n", + "\n", + "# Load model with quantization\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " device_map=\"auto\",\n", + " trust_remote_code=True\n", + ")\n", + "\n", + "# Prepare model for k-bit training\n", + "model = prepare_model_for_kbit_training(model)\n", + "\n", + "# Apply LoRA\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", + "metadata": {}, + "source": [ + "## Load and Prepare Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", + "metadata": {}, + "outputs": [], + "source": [ + "def format_prompt(example):\n", + " if example[\"input\"]:\n", + " return f\"\"\"Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n", + "\n", + "### Instruction:\n", + "{example['instruction']}\n", + "\n", + "### Input:\n", + "{example['input']}\n", + "\n", + "### Response:\n", + "{example['output']}\"\"\"\n", + " else:\n", + " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", + "\n", + "### Instruction:\n", + "{example['instruction']}\n", + "\n", + "### Response:\n", + "{example['output']}\"\"\"\n", + "\n", + "# Load dataset\n", + "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\")\n", + "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)})\n", + "\n", + "# Split dataset\n", + "dataset = dataset.train_test_split(test_size=0.1)\n", + "train_dataset = dataset[\"train\"]\n", + "eval_dataset = dataset[\"test\"]" + ] + }, + { + "cell_type": "markdown", + "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", + "metadata": {}, + "source": [ + "## Enhanced Training with SFTTrainer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", + "metadata": {}, + "outputs": [], + "source": [ + "trainer = SFTTrainer(\n", + " model=model,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " peft_config=peft_config,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=1024,\n", + " tokenizer=tokenizer,\n", + " args=training_args,\n", + " packing=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "810eb75e", + "metadata": {}, + "source": [ + "## Federated Averaging Function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58298e8e-ab9e-4377-966e-143823441697", + "metadata": {}, + "outputs": [], + "source": [ + "def FedAvg(peft_params, model, weights=None):\n", + " \"\"\"\n", + " Perform Federated Averaging (FedAvg) on the model parameters.\n", + " \"\"\"\n", + " state_dicts = peft_params\n", + " state_dict = 
get_peft_model_state_dict(model)\n", + " for key in peft_params[0]:\n", + " dtype = state_dicts[0][key].dtype\n", + " state_dict[key] = torch.from_numpy(\n", + " np.average(\n", + " [state[key].to(torch.float).numpy() for state in state_dicts], \n", + " axis=0, \n", + " weights=weights\n", + " )\n", + " ).to(dtype)\n", + " set_peft_model_state_dict(model, state_dict)\n", + " return model" + ] + }, + { + "cell_type": "markdown", + "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", + "metadata": {}, + "source": [ + "## Federated Learning Workflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", + "metadata": {}, + "outputs": [], + "source": [ + "class FederatedFlow(FLSpec):\n", + " def __init__(self, model=None, rounds=3, **kwargs):\n", + " super().__init__(**kwargs)\n", + " if model is not None:\n", + " self.model = model\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " else:\n", + " raise ValueError(\"No model provided\")\n", + " \n", + " self.rounds = rounds\n", + " \n", + " @aggregator\n", + " def start(self):\n", + " print(\"Initializing federated learning\")\n", + " self.collaborators = self.runtime.collaborators\n", + " self.current_round = 0\n", + " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", + " \n", + " @collaborator\n", + " def aggregated_model_validation(self):\n", + " print(f\"Validating aggregated model for {self.input}\")\n", + " # Load model with quantization\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " device_map=\"auto\",\n", + " trust_remote_code=True\n", + " )\n", + " self.model = prepare_model_for_kbit_training(self.model)\n", + " self.model = get_peft_model(self.model, peft_config)\n", + " set_peft_model_state_dict(self.model, self.peft_params)\n", + " \n", + " # Evaluate\n", + " eval_results = trainer.evaluate()\n", + " self.agg_validation_score = 
eval_results[\"eval_loss\"]\n", + " print(f\"Validation loss: {self.agg_validation_score}\")\n", + " self.next(self.train)\n", + " \n", + " @collaborator\n", + " def train(self):\n", + " print(f\"Training on {self.input}\")\n", + " # Train with local data\n", + " trainer.train()\n", + " self.loss = trainer.state.log_history[-1][\"loss\"]\n", + " self.next(self.local_model_validation)\n", + " \n", + " @collaborator\n", + " def local_model_validation(self):\n", + " print(f\"Validating local model for {self.input}\")\n", + " eval_results = trainer.evaluate()\n", + " self.local_validation_score = eval_results[\"eval_loss\"]\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " self.next(self.join, exclude=[\"model\"])\n", + " \n", + " @aggregator\n", + " def join(self, inputs):\n", + " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", + " self.aggregated_model_accuracy = sum(\n", + " input.agg_validation_score for input in inputs\n", + " ) / len(inputs)\n", + " self.local_model_accuracy = sum(\n", + " input.local_validation_score for input in inputs\n", + " ) / len(inputs)\n", + " \n", + " print(f\"Round {self.current_round + 1} results:\")\n", + " print(f\"Average training loss: {self.average_loss}\")\n", + " print(f\"Average validation loss (before training): {self.aggregated_model_accuracy}\")\n", + " print(f\"Average validation loss (after training): {self.local_model_accuracy}\")\n", + " \n", + " # Federated averaging\n", + " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " \n", + " self.current_round += 1\n", + " if self.current_round < self.rounds:\n", + " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", + " else:\n", + " self.next(self.end)\n", + " \n", + " @aggregator\n", + " def end(self):\n", + " print(\"Federated training complete!\")\n", + " print(f\"Final model validation loss: 
{self.aggregated_model_accuracy}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7bc8fe27", + "metadata": {}, + "source": [ + "## Run Federated Learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup participants\n", + "aggregator = Aggregator()\n", + "collaborators = [\n", + " Collaborator(name=\"Portland\"),\n", + " Collaborator(name=\"Seattle\"),\n", + " Collaborator(name=\"London\")\n", + "]\n", + "\n", + "# Assign data shards\n", + "for idx, colab in enumerate(collaborators):\n", + " colab.private_attributes = {\n", + " \"train_dataset\": train_dataset.shard(len(collaborators), idx),\n", + " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx)\n", + " }\n", + "\n", + "# Create and run workflow\n", + "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", + "flflow = FederatedFlow(model, rounds=3)\n", + "flflow.runtime = runtime\n", + "flflow.run()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 173c1e3b3ceee3a42abe387ca0bbb4693a98b808 Mon Sep 17 00:00:00 2001 From: Rajith Date: Fri, 16 May 2025 11:04:32 +0530 Subject: [PATCH 26/34] adding phi-4 with 4 bit quantization --- .../LLM/phi-4-with4bit quantization.ipynb | 1772 +++++++++++++++++ .../experimental/workflow/LLM/phi-4.ipynb | 705 ------- 2 files changed, 1772 insertions(+), 705 deletions(-) create mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb delete mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb diff 
--git a/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb new file mode 100644 index 0000000000..e2efa9054b --- /dev/null +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb @@ -0,0 +1,1772 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a59f475d-d843-46bc-b75e-10984b687ed3", + "metadata": {}, + "source": [ + "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization" + ] + }, + { + "cell_type": "markdown", + "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", + "metadata": {}, + "source": [ + "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n", + "- Parameter-Efficient Fine-Tuning (PEFT)\n", + "- 4-bit Quantization (QLoRA)\n", + "- Gradient Checkpointing\n", + "- Optimized Training Configuration" + ] + }, + { + "cell_type": "markdown", + "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a7ae1a7e-8c16-4c5a-be57-33d84723aed7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Thu May 15 13:27:27 2025 \n", + "+-----------------------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |\n", + "|-----------------------------------------+------------------------+----------------------+\n", + "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. 
ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|=========================================+========================+======================|\n", + "| 0 NVIDIA H100 NVL Off | 00000001:00:00.0 Off | 0 |\n", + "| N/A 39C P0 62W / 400W | 1MiB / 95830MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-----------------------------------------+------------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=========================================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------------------+\n" + ] + } + ], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", + "metadata": {}, + "source": [ + "## Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/azureuser/env_name/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2025-05-15 13:27:30,648\tINFO util.py:154 -- Missing packages: ['ipywidgets']. 
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], + "source": [ + "# System imports\n", + "import os\n", + "import numpy as np\n", + "\n", + "# PyTorch imports\n", + "import torch\n", + "\n", + "# Hugging Face Transformers imports for model loading and training\n", + "from transformers import (\n", + " AutoModelForCausalLM, # For loading large language models\n", + " AutoTokenizer, # For tokenizing text inputs\n", + " BitsAndBytesConfig, # For 4-bit quantization configuration\n", + " TrainingArguments # For configuring training hyperparameters\n", + ")\n", + "\n", + "# PEFT (Parameter-Efficient Fine-Tuning) imports\n", + "from peft import (\n", + " LoraConfig, # For configuring Low-Rank Adaptation\n", + " get_peft_model, # For applying PEFT to a model\n", + " prepare_model_for_kbit_training, # For preparing quantized models for training\n", + " PeftModel # Base class for PEFT models\n", + ")\n", + "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # For state dict manipulation\n", + "\n", + "# Dataset and training imports\n", + "from datasets import load_dataset\n", + "from trl import SFTTrainer # Supervised Fine-Tuning Trainer\n", + "\n", + "# OpenFL imports for federated learning\n", + "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n", + "from openfl.experimental.workflow.placement import aggregator, collaborator\n", + "from openfl.experimental.workflow.runtime import LocalRuntime" + ] + }, + { + "cell_type": "markdown", + "id": "06274755", + "metadata": {}, + "source": [ + "## Acquiring and preprocessing dataset" + ] + }, + { + "cell_type": "markdown", + "id": "a6edefa4", + "metadata": {}, + "source": [ + "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "962ac825", + "metadata": {}, + "outputs": [], 
+ "source": [ + "# Import libraries needed for downloading and verifying the dataset\n", + "import hashlib\n", + "import requests\n", + "\n", + "def file_checksum(file_path, algorithm=\"sha256\"):\n", + " \"\"\"\n", + " Calculate the checksum of a file using the specified hashing algorithm.\n", + " \n", + " Args:\n", + " file_path (str): The path to the file for which the checksum is to be calculated.\n", + " algorithm (str): The hashing algorithm to use (default is 'sha256').\n", + " \n", + " Returns:\n", + " str: The calculated checksum of the file.\n", + " \"\"\"\n", + " hash_func = hashlib.new(algorithm)\n", + " with open(file_path, \"rb\") as f:\n", + " for chunk in iter(lambda: f.read(4096), b\"\"):\n", + " hash_func.update(chunk)\n", + " return hash_func.hexdigest()\n", + "\n", + "\n", + "# Download the dataset if it doesn't exist locally\n", + "if not os.path.exists(\"math_10k.json\"):\n", + " print(\"Downloading math_10k.json dataset...\")\n", + " r = requests.get(\n", + " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n", + " )\n", + " with open(\n", + " \"math_10k.json\",\n", + " \"wb\",\n", + " ) as f:\n", + " f.write(r.content)\n", + " print(\"Download complete.\")\n", + "\n", + " # Verify the integrity of the downloaded file\n", + " actual_checksum = file_checksum(\"math_10k.json\")\n", + " expected_checksum = \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n", + " if actual_checksum != expected_checksum:\n", + " raise ValueError(\n", + " \"Checksum verification failed. 
The file may have been altered.\"\n", + " )\n", + " print(\"Checksum verification successful.\")\n", + "else:\n", + " print(\"Dataset already exists locally.\")\n", + "\n", + "# Set the dataset path to be used later\n", + "dataset_name = \"math_10k.json\"" + ] + }, + { + "cell_type": "markdown", + "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eada9809-468a-47c6-9b03-55aa887c9487", + "metadata": {}, + "outputs": [], + "source": [ + "# Model and dataset configuration\n", + "model_name = \"microsoft/phi-4\" # Pre-trained model identifier from Hugging Face Hub\n", + "#dataset_name = \"math_10k.json\" # Dataset file containing mathematical QA pairs\n", + "\n", + "# QLoRA (Quantized Low-Rank Adaptation) configuration for 4-bit quantization\n", + "# This reduces memory footprint while maintaining model quality\n", + "bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True, # Enable 4-bit quantization\n", + " bnb_4bit_quant_type=\"nf4\", # Use the 4-bit NormalFloat (NF4) data type introduced by QLoRA\n", + " bnb_4bit_compute_dtype=torch.bfloat16, # Computation precision\n", + " bnb_4bit_use_double_quant=False, # Disable nested quantization for simplicity\n", + ")\n", + "\n", + "# LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning\n", + "# This allows fine-tuning with significantly fewer parameters\n", + "peft_config = LoraConfig(\n", + " r=8, # Rank of the update matrices (higher = more capacity but more parameters)\n", + " lora_alpha=16, # Scaling factor for the LoRA update (applied as lora_alpha / r)\n", + " lora_dropout=0.01, # Dropout probability for LoRA layers\n", + " bias=\"none\", # Don't train bias parameters to reduce memory\n", + " task_type=\"CAUSAL_LM\", # Specify causal language modeling task\n", + " target_modules=\"all-linear\", # Apply LoRA to all linear layers\n", + ")\n", + "\n", + "# Training hyperparameters configuration\n", +
"training_args = TrainingArguments(\n", + " output_dir=\"./results\", # Directory to save checkpoints and logs\n", + " num_train_epochs=1, # Number of training epochs\n", + " per_device_train_batch_size=2, # Batch size per GPU/TPU core\n", + " gradient_accumulation_steps=2, # Number of updates steps to accumulate before backward pass\n", + " optim=\"adamw_torch_fused\", # Optimizer to use (fused for better performance)\n", + " save_steps=100, # Save checkpoint every X updates steps\n", + " logging_steps=10, # Log metrics every X updates steps\n", + " learning_rate=3e-4, # Initial learning rate\n", + " weight_decay=0.001, # Weight decay regularization\n", + " fp16=False, # Disable FP16 training (using BF16 instead)\n", + " bf16=True, # Enable BF16 training (better numerical stability than FP16)\n", + " max_grad_norm=0.5, # Max gradient norm for gradient clipping\n", + " warmup_ratio=0.02, # Portion of steps for learning rate warmup\n", + " lr_scheduler_type=\"cosine\", # Learning rate scheduler type\n", + " gradient_checkpointing=True, # Enable gradient checkpointing to save memory\n", + " report_to=\"none\" # Disable reporting to tracking platforms\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", + "metadata": {}, + "source": [ + "## Load and Prepare Model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.36it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 27,852,800 || all params: 14,687,360,000 || trainable%: 0.1896\n" + ] + } + ], + "source": [ + "# Load tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + 
"tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\"\n", + "\n", + "# Load model with quantization\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " device_map=\"auto\",\n", + " trust_remote_code=True\n", + ")\n", + "\n", + "# Prepare model for k-bit training\n", + "model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)\n", + "\n", + "# Apply LoRA\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", + "metadata": {}, + "source": [ + "## Load and Prepare Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", + "metadata": {}, + "outputs": [], + "source": [ + "def format_prompt(example):\n", + " \"\"\"\n", + " Format a dataset example into a standardized prompt-response format for instruction tuning.\n", + " \n", + " This function converts raw dataset examples into a structured format suitable for\n", + " instruction fine-tuning of large language models. The format follows the common\n", + " pattern used for instruction-following tasks with clear section demarcation.\n", + " \n", + " Args:\n", + " example (dict): A dictionary containing the example data with keys:\n", + " - 'instruction': The task instruction\n", + " - 'input': The optional input context (may be empty)\n", + " - 'output': The expected output/response\n", + " \n", + " Returns:\n", + " str: A formatted prompt string with instruction, optional input, and response\n", + " \"\"\"\n", + " if example[\"input\"]:\n", + " return f\"\"\"Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n", + "\n", + "### Instruction:\n", + "{example['instruction']}\n", + "\n", + "### Input:\n", + "{example['input']}\n", + "\n", + "### Response:\n", + "{example['output']}\"\"\"\n", + " else:\n", + " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", + "\n", + "### Instruction:\n", + "{example['instruction']}\n", + "\n", + "### Response:\n", + "{example['output']}\"\"\"\n", + "\n", + "# Load dataset from JSON file (contains mathematical question-answer pairs)\n", + "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\", num_proc=4)\n", + "\n", + "# Transform raw examples into formatted text for instruction tuning\n", + "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)}, num_proc=4)\n", + "\n", + "# Split dataset into training (90%) and evaluation (10%) sets\n", + "dataset = dataset.train_test_split(test_size=0.1)\n", + "train_dataset = dataset[\"train\"]\n", + "eval_dataset = dataset[\"test\"]" + ] + }, + { + "cell_type": "markdown", + "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", + "metadata": {}, + "source": [ + "## Enhanced Training with SFTTrainer" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 1820 examples [00:02, 613.71 examples/s]\n", + "Generating train split: 209 examples [00:00, 582.95 examples/s]\n" + ] + } + ], + "source": [ + "trainer = SFTTrainer(\n", + " model=model,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " peft_config=peft_config,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=1024,\n", + " tokenizer=tokenizer,\n", + " args=training_args,\n", + " packing=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "810eb75e", + "metadata": {}, + "source": [ + "## Federated Averaging Function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58298e8e-ab9e-4377-966e-143823441697", + "metadata": {}, + "outputs": [], + "source": [ + "def FedAvg(peft_params, model, weights=None):\n", + " \"\"\"\n", + " Perform Federated Averaging (FedAvg) on the model parameters.\n", + " \n", + " This function aggregates PEFT parameters from multiple collaborators using weighted\n", + " averaging. 
It handles the complex task of averaging parameters while maintaining \n", + " the correct tensor types and shapes required by the PEFT framework.\n", + " \n", + " Args:\n", + " peft_params (list): A list of state dictionaries containing PEFT parameters from different collaborators.\n", + " model (torch.nn.Module): The base model to which the averaged parameters will be applied.\n", + " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n", + " Weights determine the contribution of each collaborator to the final model.\n", + " \n", + " Returns:\n", + " torch.nn.Module: The model with the averaged parameters applied.\n", + " \n", + " Notes:\n", + " The function converts tensors to float for averaging to avoid precision issues,\n", + " then converts back to the original data type for model compatibility.\n", + " \"\"\"\n", + " # Store the state dictionaries for easy access\n", + " state_dicts = peft_params\n", + " # Get the current state dict from the model as a template\n", + " state_dict = get_peft_model_state_dict(model)\n", + " \n", + " # Iterate through each parameter in the first state dict as reference\n", + " for key in peft_params[0]:\n", + " # Store original data type for later conversion\n", + " dtype = state_dicts[0][key].dtype\n", + " \n", + " # Convert all tensors to float, move to CPU, perform weighted average\n", + " state_dict[key] = torch.from_numpy(\n", + " np.average(\n", + " [state[key].cpu().to(torch.float).numpy() for state in state_dicts], \n", + " axis=0, \n", + " weights=weights\n", + " )\n", + " ).to(dtype) # Convert back to original data type\n", + " \n", + " # Apply the averaged parameters back to the model\n", + " set_peft_model_state_dict(model, state_dict)\n", + " return model" + ] + }, + { + "cell_type": "markdown", + "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", + "metadata": {}, + "source": [ + "## Federated Learning Workflow" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aggregator step \"start\" registered\n", + "Collaborator step \"aggregated_model_validation\" registered\n", + "Collaborator step \"train\" registered\n", + "Collaborator step \"local_model_validation\" registered\n", + "Aggregator step \"join\" registered\n", + "Aggregator step \"end\" registered\n" + ] + } + ], + "source": [ + "# Import the required PrinterCallback for proper initialization/removal\n", + "from transformers.trainer_callback import PrinterCallback\n", + "import transformers\n", + "\n", + "class FederatedFlow(FLSpec):\n", + " \"\"\"\n", + " Federated Learning workflow for fine-tuning Phi-4 model with PEFT and quantization.\n", + " \n", + " This class implements the complete federated learning workflow for a language model,\n", + " including initialization, aggregated model validation, training, local model validation,\n", + " and parameter aggregation. It uses Parameter-Efficient Fine-Tuning (PEFT) with 4-bit\n", + " quantization to efficiently train large language models in memory-constrained environments.\n", + " \n", + " The workflow follows these steps for each round:\n", + " 1. Initialize model on each collaborator\n", + " 2. Validate the aggregated model on local data\n", + " 3. Train the model locally on each collaborator\n", + " 4. Validate the locally trained model\n", + " 5. Aggregate PEFT parameters from all collaborators using FedAvg\n", + " 6. 
Repeat for specified number of rounds\n", + " \n", + " Attributes:\n", + " model: The base language model being fine-tuned\n", + " peft_params: PEFT parameters dictionary for the model\n", + " optimizer: Optimizer for training (optional)\n", + " rounds: Number of federated learning rounds to perform\n", + " current_round: Counter for the current round\n", + " collaborators: List of collaborators participating in federated learning\n", + " \"\"\"\n", + " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n", + " \"\"\"\n", + " Initialize the federated learning workflow.\n", + " \n", + " Args:\n", + " model: The base language model to fine-tune. Must be provided.\n", + " optimizer: Optional optimizer for model training.\n", + " rounds: Number of federated learning rounds to perform (default: 3).\n", + " **kwargs: Additional arguments passed to the parent class.\n", + " \n", + " Raises:\n", + " ValueError: If no model is provided.\n", + " \"\"\"\n", + " super().__init__(**kwargs)\n", + " if model is not None:\n", + " self.model = model\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " self.optimizer = optimizer\n", + " else:\n", + " raise ValueError(\"No model inputted\")\n", + "\n", + " self.rounds = rounds\n", + " \n", + "\n", + " @aggregator\n", + " def start(self):\n", + " \"\"\"\n", + " Start the federated learning process on the aggregator.\n", + " \n", + " This method initializes the workflow by:\n", + " 1. Setting up the list of collaborators from the runtime\n", + " 2. Initializing the current round counter\n", + " 3. 
Starting the first step of the workflow by sending the model\n", + " to all collaborators for validation\n", + " \n", + " The @aggregator decorator ensures this method runs on the aggregator node.\n", + " \"\"\"\n", + " print(f\"Performing initialization for model\")\n", + " self.collaborators = self.runtime.collaborators\n", + " self.current_round = 0\n", + " # Start the workflow by sending the model to all collaborators\n", + " self.next(\n", + " self.aggregated_model_validation,\n", + " foreach=\"collaborators\",\n", + " )\n", + "\n", + " \n", + " @collaborator\n", + " def aggregated_model_validation(self):\n", + " \"\"\"\n", + " Validate the aggregated model on each collaborator's local dataset.\n", + " \n", + " This method:\n", + " 1. Loads the model with appropriate quantization configuration\n", + " 2. Applies the PEFT configuration and parameters\n", + " 3. Creates a trainer with local validation dataset\n", + " 4. Evaluates the model and records the validation loss\n", + " 5. Transitions to the training phase\n", + " \n", + " The @collaborator decorator ensures this method runs on each collaborator node.\n", + " \n", + " Notes:\n", + " Includes fallback to CPU if GPU memory is insufficient\n", + " \"\"\"\n", + " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n", + " # Load model with quantization and CPU offloading if needed\n", + " device_map = \"auto\" \n", + " try:\n", + " # Try to load model on GPU with quantization\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " device_map=device_map,\n", + " #max_memory={0: \"4GiB\", \"cpu\": \"24GiB\"},\n", + " trust_remote_code=True\n", + " )\n", + " except ValueError:\n", + " # Fallback to CPU if GPU memory is insufficient\n", + " print(f\"Falling back to CPU mode for {self.input}\")\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " device_map=\"cpu\",\n", + " 
trust_remote_code=True\n", + " )\n", + " \n", + " # Prepare model for training with quantization\n", + " self.model = prepare_model_for_kbit_training(self.model)\n", + " # Apply PEFT configuration (LoRA)\n", + " self.model = get_peft_model(self.model, peft_config)\n", + " # Load aggregated parameters\n", + " set_peft_model_state_dict(self.model, self.peft_params)\n", + " \n", + " # Setup trainer for evaluation\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_args,\n", + " peft_config=peft_config,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=1024,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + "\n", + " # Remove default printer callback to avoid verbose output\n", + " trainer.remove_callback(PrinterCallback)\n", + " # Evaluate model and store metrics\n", + " out = trainer.evaluate()\n", + " self.agg_validation_score = out[\"eval_loss\"]\n", + " print(f\"{self.input} value of {self.agg_validation_score}\")\n", + " # Move to training phase\n", + " self.next(self.train)\n", + "\n", + " @collaborator\n", + " def train(self):\n", + " \"\"\"\n", + " Train the model on each collaborator's local dataset.\n", + " \n", + " This method:\n", + " 1. Creates an SFTTrainer with the local training dataset\n", + " 2. Runs the training process\n", + " 3. Records the training loss\n", + " 4. Saves the trained model\n", + " 5. 
Transitions to local validation phase\n", + " \n", + " The @collaborator decorator ensures this method runs on each collaborator node.\n", + " \"\"\"\n", + " # Setup trainer for local training\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_args,\n", + " peft_config=peft_config,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=1024,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + "\n", + " # Execute training\n", + " out = trainer.train()\n", + " # Store training loss for later analysis\n", + " self.loss = out.training_loss\n", + " # Save locally trained model\n", + " trainer.save_model()\n", + " self.training_completed = True\n", + " # Move to local validation phase\n", + " self.next(self.local_model_validation)\n", + "\n", + " @collaborator\n", + " def local_model_validation(self):\n", + " \"\"\"\n", + " Validate the locally trained model on each collaborator's validation dataset.\n", + " \n", + " This method:\n", + " 1. Creates an SFTTrainer with the local validation dataset\n", + " 2. Evaluates the locally trained model\n", + " 3. Records the validation loss\n", + " 4. Extracts the PEFT parameters for aggregation\n", + " 5. 
Sends results to the aggregator for parameter aggregation\n", + " \n", + " The @collaborator decorator ensures this method runs on each collaborator node.\n", + " \n", + " Notes:\n", + " Excludes the full model and training flags from the data sent to the aggregator\n", + " to reduce communication overhead\n", + " \"\"\"\n", + " # Setup trainer for evaluation\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_args,\n", + " peft_config=peft_config,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=1024,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + " # Evaluate the locally trained model\n", + " out = trainer.evaluate()\n", + " self.local_validation_score = out[\"eval_loss\"]\n", + " # Extract PEFT parameters for aggregation\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " print(f\"Doing local model validation for collaborator {self.input}\")\n", + " # Send results to aggregator, excluding the full model and training flags\n", + " # to reduce communication overhead\n", + " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n", + "\n", + " @aggregator\n", + " def join(self, inputs):\n", + " \"\"\"\n", + " Aggregate results from all collaborators and update the global model.\n", + " \n", + " This method:\n", + " 1. Averages the training loss and the aggregated-model and local-model validation losses (eval_loss, not accuracy) across collaborators\n", + " 2. Updates the global model using Federated Averaging (FedAvg)\n", + " 3. Saves the aggregated model and tokenizer\n", + " 4.
Either starts the next round or ends the workflow depending on round count\n", + " \n", + " Args:\n", + " inputs: List of data objects from all collaborators containing validation scores\n", + " and PEFT parameters.\n", + " \n", + " The @aggregator decorator ensures this method runs on the aggregator node.\n", + " \"\"\"\n", + " # Calculate average metrics across all collaborators\n", + " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", + " self.aggregated_model_accuracy = sum(\n", + " input.agg_validation_score for input in inputs\n", + " ) / len(inputs)\n", + " self.local_model_accuracy = sum(\n", + " input.local_validation_score for input in inputs\n", + " ) / len(inputs)\n", + " \n", + " # Display aggregated metrics\n", + " print(\n", + " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n", + " )\n", + " print(f\"Average training loss = {self.average_loss}\")\n", + " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n", + "\n", + " # Perform federated averaging of model parameters\n", + " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + "\n", + " # Save the aggregated model for future use\n", + " self.model.save_pretrained(\"./aggregated/model\")\n", + " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n", + " \n", + " # Increment round counter and start next round or end workflow\n", + " self.current_round += 1\n", + " if self.current_round < self.rounds:\n", + " self.next(\n", + " self.aggregated_model_validation,\n", + " foreach=\"collaborators\",\n", + " exclude=[\"model\"],\n", + " )\n", + " else:\n", + " self.next(self.end)\n", + "\n", + " @aggregator\n", + " def end(self):\n", + " \"\"\"\n", + " End the federated learning process.\n", + " \n", + " This method marks the end of the federated learning workflow after all rounds\n", + " have been completed. 
The final aggregated model and tokenizer are already saved\n", + " in the last join step.\n", + " \n", + " The @aggregator decorator ensures this method runs on the aggregator node.\n", + " \"\"\"\n", + " print(f\"This is the end of the flow\")" + ] + }, + { + "cell_type": "markdown", + "id": "7bc8fe27", + "metadata": {}, + "source": [ + "## Run Federated Learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling start\n", + "\u001b[94mPerforming initialization for model\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 913 examples [00:01, 623.08 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 104 examples [00:00, 583.62 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland value of 0.5918120741844177\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 913 examples [00:01, 616.37 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 104 examples [00:00, 615.85 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0m`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [228/228 08:54, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.516900
200.373400
300.346100
400.339100
500.333000
600.323700
700.329800
800.312800
900.326000
1000.306800
1100.314900
1200.328300
1300.311300
1400.313400
1500.315400
1600.312200
1700.303000
1800.307400
1900.307600
2000.312500
2100.307000
2200.308000

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.32it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 907 examples [00:01, 626.09 
examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 105 examples [00:00, 634.41 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [14/14 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle value of 0.589488685131073\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [227/227 08:53, Epoch 1/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.500700
200.392300
300.364500
400.327800
500.342000
600.310900
700.318500
800.317900
900.333300
1000.321300
1100.312500
1200.301500
1300.314000
1400.317100
1500.316800
1600.318300
1700.318000
1800.295800
1900.311000
2000.310900
2100.327200
2200.311600

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [14/14 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling join\n", + "\u001b[94mAverage aggregated model validation values = 0.5906503796577454\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage training loss = 0.3295206361469617\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage local model validation values = 0.3146952837705612\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.33it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland value of 0.31504756212234497\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [228/228 08:57, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.314000
200.292000
300.287100
400.288900
500.283600
600.281300
700.290200
800.277900
900.291600
1000.278400
1100.285700
1200.302800
1300.291500
1400.295600
1500.299100
1600.298400
1700.291200
1800.297700
1900.298500
2000.305400
2100.301000
2200.303100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [14/14 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle value of 0.31057578325271606\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [227/227 08:50, Epoch 1.00/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.300900
200.307900
300.303400
400.274300
500.295200
600.270700
700.280300
800.284600
900.298700
1000.290900
1100.284300
1200.277500
1300.292400
1400.299300
1500.298800
1600.305300
1700.304600
1800.286900
1900.302600
2000.305300
2100.320000
2200.306800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Setup federated learning participants\n", + "aggregator = Aggregator() # Central coordinator that aggregates model updates\n", + "collaborators = [\n", + " Collaborator(name=\"Portland\"), # First participant with local dataset\n", + " Collaborator(name=\"Seattle\") # Second participant with local dataset\n", + "]\n", + "\n", + "# Distribute data shards to collaborators (simulating data silos)\n", + "# Each collaborator gets a non-overlapping portion of the dataset\n", + "for idx, colab in enumerate(collaborators):\n", + " colab.private_attributes = {\n", + " \"train_dataset\": train_dataset.shard(len(collaborators), idx), # Training shard\n", + " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx) # Evaluation shard\n", + " }\n", + "\n", + "# Set up and execute the federated learning workflow\n", + "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators) # Local simulation runtime\n", + "flflow = FederatedFlow(model, rounds=2) # Create flow with 2 federated learning rounds\n", + "flflow.runtime = runtime # Assign runtime to the flow\n", + "flflow.run() # Start the federated learning process" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (myenv)", + "language": "python", + "name": "myenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb deleted file mode 100644 index 0c6884f384..0000000000 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4.ipynb +++ /dev/null @@ -1,705 +0,0 @@ -{ - "cells": [ - { - "cell_type": 
"markdown", - "id": "a59f475d-d843-46bc-b75e-10984b687ed3", - "metadata": {}, - "source": [ - "# Federated Fine-Tuning of Phi-4 Using OpenFL" - ] - }, - { - "cell_type": "markdown", - "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", - "metadata": {}, - "source": [ - "\n", - "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow.\n", - "\n", - "We will fine-tune **Microsoft's [Phi4](https://huggingface.co/microsoft/phi-4)** model using a diverse dataset such as [Math_10k](https://github.com/AGI-Edgerunners/LLM-Adapters/tree/main), an open-source dataset containing mathematical question-answer pairs collected from various smaller math datasets." - ] - }, - { - "cell_type": "markdown", - "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", - "metadata": {}, - "source": [ - "## The Workflow Interface" - ] - }, - { - "cell_type": "markdown", - "id": "e3d74610-e48d-4dd4-b622-eb910fbe91aa", - "metadata": {}, - "source": [ - "The workflow interface is an innovative approach to designing federated learning experiments with OpenFL. It was developed in response to discussions with researchers and users who had unique use cases that didn’t perfectly align with the traditional horizontal federated learning model. 
This interface enables more flexible compositions of experiments, allowing for greater customization and adaptability in complex, real-world scenarios" - ] - }, - { - "cell_type": "markdown", - "id": "413e1d95-fd76-4fe0-b8d0-4c625c2a8fd3", - "metadata": {}, - "source": [ - "## Installing OpenFL\n", - "To install OpenFL, follow the official documentation: \n", - "[OpenFL Installation Guide](https://openfl.readthedocs.io/en/latest/installation.html)" - ] - }, - { - "cell_type": "markdown", - "id": "53654c70", - "metadata": {}, - "source": [ - "After installation, activate experimental APIs using: \n", - "`fx experimental activate`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc", - "metadata": {}, - "outputs": [], - "source": [ - "# Install dependencies \n", - "!pip install torch transformers peft datasets trl==0.12.2 -q" - ] - }, - { - "cell_type": "markdown", - "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", - "metadata": {}, - "source": [ - "## Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", - "metadata": {}, - "outputs": [], - "source": [ - "import hashlib\n", - "import os\n", - "\n", - "import numpy as np\n", - "import requests\n", - "import torch\n", - "import transformers\n", - "from datasets import load_dataset\n", - "from peft import LoraConfig, get_peft_model\n", - "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict\n", - "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments\n", - "from transformers.trainer_callback import PrinterCallback\n", - "from trl import SFTTrainer\n", - "\n", - "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n", - "from openfl.experimental.workflow.placement import aggregator, collaborator\n", - "from openfl.experimental.workflow.runtime import LocalRuntime" - ] - }, - { - "cell_type": "markdown", - "id": 
"08576aa0-f628-4ae6-8fc3-dd167d164784", - "metadata": {}, - "source": [ - "## Acquiring and preprocessing dataset" - ] - }, - { - "cell_type": "markdown", - "id": "7ba1d8b6-8a5b-41a2-8c77-c9a85e869cda", - "metadata": {}, - "source": [ - "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d615d626-8727-4169-b2a6-3ba15c3cdb95", - "metadata": {}, - "outputs": [], - "source": [ - "def file_checksum(file_path, algorithm=\"sha256\"):\n", - " \"\"\"\n", - " Calculate the checksum of a file using the specified hashing algorithm.\n", - "\n", - " Parameters:\n", - " file_path (str): The path to the file for which the checksum is to be calculated.\n", - " algorithm (str): The hashing algorithm to use (default is 'sha256').\n", - "\n", - " Returns:\n", - " str: The calculated checksum of the file.\n", - " \"\"\"\n", - " hash_func = hashlib.new(algorithm)\n", - " with open(file_path, \"rb\") as f:\n", - " for chunk in iter(lambda: f.read(4096), b\"\"):\n", - " hash_func.update(chunk)\n", - " return hash_func.hexdigest()\n", - "\n", - "\n", - "if not os.path.exists(\"math_10k.json\"):\n", - " r = requests.get(\n", - " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n", - " )\n", - " with open(\n", - " \"math_10k.json\",\n", - " \"wb\",\n", - " ) as f:\n", - " f.write(r.content)\n", - "\n", - " actual_checksum = file_checksum(\"math_10k.json\")\n", - " if (\n", - " actual_checksum\n", - " != \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n", - " ):\n", - " raise ValueError(\n", - " \"Checksum verification failed. 
The file may have been altered.\"\n", - " )\n", - "\n", - "raw_dataset = load_dataset(\"json\", data_files=\"math_10k.json\")" - ] - }, - { - "cell_type": "markdown", - "id": "3ab15ad6-db35-4a58-a2d5-54a6d3ccdc78", - "metadata": {}, - "source": [ - "## Initialize arguments and configurations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eada9809-468a-47c6-9b03-55aa887c9487", - "metadata": {}, - "outputs": [], - "source": [ - "training_config = {\n", - " \"bf16\": True,\n", - " \"use_cpu\": True,\n", - " \"do_eval\": False,\n", - " \"learning_rate\": 5.0e-06,\n", - " \"log_level\": \"info\",\n", - " \"logging_steps\": 20,\n", - " \"lr_scheduler_type\": \"cosine\",\n", - " \"num_train_epochs\": 1,\n", - " \"output_dir\": \"./checkpoint_dir\",\n", - " \"overwrite_output_dir\": True,\n", - " \"per_device_eval_batch_size\": 1,\n", - " \"per_device_train_batch_size\": 1,\n", - " \"save_steps\": 100,\n", - " \"save_total_limit\": 1,\n", - " \"seed\": 0,\n", - " \"gradient_checkpointing\": True,\n", - " \"gradient_checkpointing_kwargs\": {\"use_reentrant\": False},\n", - " \"warmup_ratio\": 0.2,\n", - "}\n", - "\n", - "peft_config = {\n", - " \"r\": 1,\n", - " \"lora_alpha\": 2,\n", - " \"lora_dropout\": 0.05,\n", - " \"bias\": \"none\",\n", - " \"task_type\": \"CAUSAL_LM\",\n", - " \"target_modules\": \"all-linear\",\n", - " \"modules_to_save\": None,\n", - "}\n", - "model_kwargs = dict(\n", - " use_cache=False,\n", - " trust_remote_code=True,\n", - " torch_dtype=torch.bfloat16,\n", - " device_map=None,\n", - ")\n", - "train_conf = TrainingArguments(**training_config)\n", - "peft_conf = LoraConfig(**peft_config)" - ] - }, - { - "cell_type": "markdown", - "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", - "metadata": {}, - "source": [ - "## Load and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", - "metadata": {}, - "outputs": [], - "source": [ - "checkpoint_path = 
\"NyxKrage/Microsoft_Phi-4\"\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " checkpoint_path, return_dict=True, **model_kwargs\n", - ")\n", - "model = get_peft_model(model, peft_conf)\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)\n", - "sequence_max_length = 512\n", - "val_set_size = 2000\n", - "tokenizer.pad_token_id = 0 # we want this to be different from the eos token\n", - "tokenizer.padding_side = \"left\" # Allow batched inference" - ] - }, - { - "cell_type": "markdown", - "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", - "metadata": {}, - "source": [ - "## Preprocess dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", - "metadata": {}, - "outputs": [], - "source": [ - "def generate_prompt(data_point):\n", - " \"\"\"\n", - " Generate a prompt based on the given data point.\n", - "\n", - " Parameters:\n", - " data_point (dict): A dictionary containing the instruction, input, and output.\n", - "\n", - " Returns:\n", - " str: The generated prompt as a string.\n", - " \"\"\"\n", - " if data_point[\"input\"]:\n", - " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. \n", - "\n", - " ### Instruction:\n", - " {data_point[\"instruction\"]}\n", - " \n", - " ### Input:\n", - " {data_point[\"input\"]}\n", - " \n", - " ### Response:\n", - " {data_point[\"output\"]}\"\"\"\n", - " else:\n", - " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request. 
\n", - "\n", - " ### Instruction:\n", - " {data_point[\"instruction\"]}\n", - " \n", - " ### Response:\n", - " {data_point[\"output\"]}\"\"\"\n", - "\n", - "\n", - "def tokenize(prompt, add_eos_token=True):\n", - " \"\"\"\n", - " Tokenize the given prompt.\n", - "\n", - " Parameters:\n", - " prompt (str): The prompt to be tokenized.\n", - " add_eos_token (bool): Whether to add an end-of-sequence token (default is True).\n", - "\n", - " Returns:\n", - " dict: A dictionary containing the tokenized input IDs and attention mask.\n", - " \"\"\"\n", - " result = tokenizer(\n", - " prompt,\n", - " truncation=True,\n", - " max_length=sequence_max_length,\n", - " padding=False,\n", - " return_tensors=None,\n", - " )\n", - " if (\n", - " result[\"input_ids\"][-1] != tokenizer.eos_token_id\n", - " and len(result[\"input_ids\"]) < sequence_max_length\n", - " and add_eos_token\n", - " ):\n", - " result[\"input_ids\"].append(tokenizer.eos_token_id)\n", - " result[\"attention_mask\"].append(1)\n", - "\n", - " result[\"labels\"] = result[\"input_ids\"].copy()\n", - "\n", - " return result\n", - "\n", - "\n", - "def generate_and_tokenize_prompt(data_point):\n", - " \"\"\"\n", - " Generate and tokenize a prompt based on the given data point.\n", - "\n", - " Parameters:\n", - " data_point (dict): A dictionary containing the instruction, input, and output.\n", - "\n", - " Returns:\n", - " dict: A dictionary containing the tokenized input IDs, attention mask, and labels.\n", - " \"\"\"\n", - " full_prompt = generate_prompt(data_point)\n", - " tokenized_full_prompt = tokenize(full_prompt)\n", - " user_prompt = generate_prompt({**data_point, \"output\": \"\"})\n", - " tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)\n", - " user_prompt_len = len(tokenized_user_prompt[\"input_ids\"])\n", - "\n", - " tokenized_full_prompt[\"labels\"] = [-100] * user_prompt_len + tokenized_full_prompt[\n", - " \"labels\"\n", - " ][user_prompt_len:]\n", - " return tokenized_full_prompt\n", 
- "\n", - "\n", - "train_val = raw_dataset[\"train\"].train_test_split(\n", - " test_size=val_set_size, shuffle=True, seed=42\n", - ")\n", - "\n", - "processed_train_dataset = train_val[\"train\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))\n", - "processed_test_dataset = train_val[\"test\"].shuffle().map(generate_and_tokenize_prompt).select(range(3))" - ] - }, - { - "cell_type": "markdown", - "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", - "metadata": {}, - "source": [ - "## Define Federated Averaging Method\n", - "The FedAvg method is used to average the models from all the collaborators after training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", - "metadata": {}, - "outputs": [], - "source": [ - "def FedAvg(peft_params, model, weights=None):\n", - " \"\"\"\n", - " Perform Federated Averaging (FedAvg) on the model parameters.\n", - "\n", - " Parameters:\n", - " peft_params (list): A list of state dictionaries containing the model parameters from different clients.\n", - " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n", - " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n", - "\n", - " Returns:\n", - " torch.nn.Module: The model with the averaged parameters applied.\n", - " \"\"\"\n", - " state_dicts = peft_params\n", - " state_dict = get_peft_model_state_dict(model)\n", - " for key in peft_params[0]:\n", - " dtype = state_dicts[0][key].dtype\n", - " state_dict[key] = torch.from_numpy(\n", - " np.average(\n", - " [state[key].to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n", - " )\n", - " ).to(dtype)\n", - " set_peft_model_state_dict(model, state_dict)\n", - " return model" - ] - }, - { - "cell_type": "markdown", - "id": "810eb75e", - "metadata": {}, - "source": [ - "Now we come to the flow definition. 
The OpenFL Workflow Interface adopts the conventions set by Metaflow, that every workflow begins with `start` and concludes with the `end` task. The aggregator begins with an optionally passed in model and optimizer. The aggregator begins the flow with the `start` task, where the list of collaborators is extracted from the runtime (`self.collaborators = self.runtime.collaborators`) and is then used as the list of participants to run the task listed in `self.next`, `aggregated_model_validation`. The model, optimizer, and anything that is not explicitly excluded from the next function will be passed from the `start` function on the aggregator to the `aggregated_model_validation` task on the collaborator. Where the tasks run is determined by the placement decorator that precedes each task definition (`@aggregator` or `@collaborator`). Once each of the collaborators (defined in the runtime) complete the `aggregated_model_validation` task, they pass their current state onto the `train` task, from `train` to `local_model_validation`, and then finally to `join` at the aggregator. It is in `join` that an average is taken of the model weights, and the next round can begin.\n", - "\n", - "![Workflow Interface](../../../../docs/images/workflow_interface.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58298e8e-ab9e-4377-966e-143823441697", - "metadata": {}, - "outputs": [], - "source": [ - "class FederatedFlow(FLSpec):\n", - " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n", - " \"\"\"\n", - " Initialize the class with the given model, optimizer, and number of rounds.\n", - "\n", - " Parameters:\n", - " model (torch.nn.Module, optional): The model to be used. 
If None, a ValueError is raised.\n", - " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n", - " rounds (int, optional): The number of rounds for training or processing (default is 3).\n", - " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n", - "\n", - " Raises:\n", - " ValueError: If no model is provided.\n", - " \"\"\"\n", - " super().__init__(**kwargs)\n", - " if model is not None:\n", - " self.model = model\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " self.optimizer = optimizer\n", - " else:\n", - " raise ValueError(\"No model inputted\")\n", - "\n", - " self.rounds = rounds\n", - " \n", - "\n", - " @aggregator\n", - " def start(self):\n", - " \"\"\"\n", - " Initialize the model and set up the collaborators for federated learning.\n", - "\n", - " This method performs the initial setup for the model, including setting the\n", - " collaborators, initializing private variables, and starting the first round\n", - " of the federated learning process.\n", - " \"\"\"\n", - " print(f\"Performing initialization for model\")\n", - " self.collaborators = self.runtime.collaborators\n", - " self.current_round = 0\n", - " self.next(\n", - " self.aggregated_model_validation,\n", - " foreach=\"collaborators\",\n", - " )\n", - "\n", - " \n", - " @collaborator\n", - " def aggregated_model_validation(self):\n", - " \"\"\"\n", - " Perform aggregated model validation for a collaborator.\n", - "\n", - " This method loads the model, applies the PEFT configuration, and evaluates\n", - " the model using the provided training and evaluation datasets. 
The validation\n", - " score is then stored and the next step in the process is triggered.\n", - " \"\"\"\n", - " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n", - " self.model = AutoModelForCausalLM.from_pretrained(\n", - " checkpoint_path, return_dict=True, **model_kwargs\n", - " )\n", - " self.model = get_peft_model(self.model, peft_conf)\n", - " set_peft_model_state_dict(self.model, self.peft_params)\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=train_conf,\n", - " peft_config=peft_conf,\n", - " train_dataset=self.train_dataset,\n", - " eval_dataset=self.eval_dataset,\n", - " max_seq_length=sequence_max_length,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", - "\n", - " trainer.remove_callback(PrinterCallback)\n", - " out = trainer.evaluate()\n", - " self.agg_validation_score = out[\"eval_loss\"]\n", - " print(f\"{self.input} value of {self.agg_validation_score}\")\n", - " self.next(self.train)\n", - "\n", - " @collaborator\n", - " def train(self):\n", - " \"\"\"\n", - " Train the model for a collaborator.\n", - "\n", - " This method trains the model using the provided training and evaluation datasets.\n", - " The training loss is stored, the model is saved, and the next step in the process\n", - " is triggered.\n", - " \"\"\"\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=train_conf,\n", - " peft_config=peft_conf,\n", - " train_dataset=self.train_dataset,\n", - " eval_dataset=self.eval_dataset,\n", - " max_seq_length=sequence_max_length,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", 
- "\n", - " out = trainer.train()\n", - " self.loss = out.training_loss\n", - " trainer.save_model()\n", - " self.training_completed = True\n", - " self.next(self.local_model_validation)\n", - "\n", - " @collaborator\n", - " def local_model_validation(self):\n", - " \"\"\"\n", - " Perform local model validation for a collaborator.\n", - "\n", - " This method evaluates the model using the provided training and evaluation datasets.\n", - " The validation score is stored, the PEFT parameters are updated, and the next step\n", - " in the process is triggered.\n", - " \"\"\"\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=train_conf,\n", - " peft_config=peft_conf,\n", - " train_dataset=processed_train_dataset,\n", - " eval_dataset=processed_test_dataset,\n", - " max_seq_length=sequence_max_length,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", - " out = trainer.evaluate()\n", - " self.local_validation_score = out[\"eval_loss\"]\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " print(f\"Doing local model validation for collaborator {self.input}\")\n", - " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n", - "\n", - " @aggregator\n", - " def join(self, inputs):\n", - " \"\"\"\n", - " Aggregate the results from all collaborators and update the model.\n", - "\n", - " This method calculates the average loss, aggregated model accuracy, and local model\n", - " accuracy from all collaborators. 
The model parameters are updated using Federated\n", - " Averaging (FedAvg), and the next round of the process is triggered if applicable.\n", - " \"\"\"\n", - " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", - " self.aggregated_model_accuracy = sum(\n", - " input.agg_validation_score for input in inputs\n", - " ) / len(inputs)\n", - " self.local_model_accuracy = sum(\n", - " input.local_validation_score for input in inputs\n", - " ) / len(inputs)\n", - " print(\n", - " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n", - " )\n", - " print(f\"Average training loss = {self.average_loss}\")\n", - " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n", - "\n", - " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - "\n", - " self.model.save_pretrained(\"./aggregated/model\")\n", - " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n", - " self.current_round += 1\n", - " if self.current_round < self.rounds:\n", - " self.next(\n", - " self.aggregated_model_validation,\n", - " foreach=\"collaborators\",\n", - " exclude=[\"model\"],\n", - " )\n", - " else:\n", - " self.next(self.end)\n", - "\n", - " @aggregator\n", - " def end(self):\n", - " \"\"\"\n", - " End the federated learning process.\n", - "\n", - " This method marks the end of the federated learning process and performs any\n", - " necessary cleanup or finalization steps.\n", - " \"\"\"\n", - " print(f\"This is the end of the flow\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", - "metadata": {}, - "source": [ - "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_dataset` and `eval_dataset` for each of the collaborators. 
These are **private_attributes** that are exposed only through the runtime. Each participant has its own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task.\n", - "\n", - "Below, we segment shards of the Math_10k dataset for **two collaborators**: Portland and Seattle. Each has their own slice of the dataset that's accessible via the `train_dataset` or `eval_dataset` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transferring from collaborator to aggregator, or vice versa." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", - "metadata": {}, - "outputs": [], - "source": [ - "# Setup participants\n", - "_aggregator = Aggregator()\n", - "_aggregator.private_attributes = {}\n", - "\n", - "# Setup collaborators with private attributes\n", - "collaborator_names = [\n", - " \"Portland\",\n", - " \"Seattle\",\n", - "]\n", - "_collaborators = [Collaborator(name=name) for name in collaborator_names]\n", - "\n", - "for idx, current_collaborator in enumerate(_collaborators):\n", - " # Set the private attributes of the Collaborator to include their specific training and testing data loaders\n", - " current_collaborator.private_attributes = {\n", - " \"train_dataset\": processed_train_dataset.shard(\n", - " num_shards=len(_collaborators), index=idx\n", - " ),\n", - " \"eval_dataset\": processed_test_dataset.shard(\n", - " num_shards=len(_collaborators), index=idx\n", - " ),\n", - " }\n", - "\n", - "local_runtime = LocalRuntime(\n", - " aggregator=_aggregator, collaborators=_collaborators, backend=\"single_process\"\n", - ")\n", - "print(f\"Local runtime collaborators = 
{local_runtime.collaborators}\")" - ] - }, - { - "cell_type": "markdown", - "id": "9cb61fc0", - "metadata": {}, - "source": [ - "## Run Experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", - "metadata": {}, - "outputs": [], - "source": [ - "flflow = FederatedFlow(model, rounds=2)\n", - "flflow.runtime = local_runtime\n", - "flflow.run()\n", - "\n", - "# Determine the final model accuracy:\n", - "print(f'\\nFinal aggregated model accuracy for {flflow.rounds} rounds of training: {flflow.aggregated_model_accuracy}')" - ] - }, - { - "cell_type": "markdown", - "id": "7bc8fe27", - "metadata": {}, - "source": [ - "## 🎉 Congratulations! 🎉\n", - "\n", - "Now that you've completed this notebook, check out our [other tutorials](https://github.com/securefederatedai/openfl/tree/develop/openfl-tutorials/experimental/)\n", - "\n", - "- Using the LocalRuntime Ray Backend for dedicated GPU access\n", - "- Vertical Federated Learning\n", - "- Model Watermarking\n", - "- Differential Privacy\n", - "- And More!" 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 1a9c1f426e5575e54fda983b9bdc530d5408aafe Mon Sep 17 00:00:00 2001 From: Rajith Date: Fri, 16 May 2025 11:07:39 +0530 Subject: [PATCH 27/34] removing unwanted files --- .github/scripts/extract_emails.py | 45 -- .github/scripts/send_email.py | 97 ---- .../workflow/LLM/phi-4-quanti.ipynb | 300 ------------ .../experimental/workflow/LLM/phi-4-sol.ipynb | 438 ------------------ 4 files changed, 880 deletions(-) delete mode 100644 .github/scripts/extract_emails.py delete mode 100644 .github/scripts/send_email.py delete mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb delete mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb diff --git a/.github/scripts/extract_emails.py b/.github/scripts/extract_emails.py deleted file mode 100644 index 70f17b64a6..0000000000 --- a/.github/scripts/extract_emails.py +++ /dev/null @@ -1,45 +0,0 @@ -import re -import os -import sys -import json - -def extract_emails(filepath): - """ - Extract all unique email addresses from the given file. 
- """ - email_pattern = r'[\w.+-]+@[\w-]+\.[\w.-]+' - unique_emails = set() - - try: - with open(filepath, 'r') as file: - for line in file: - # Skip comment lines that don't contain emails - if line.strip().startswith('#') and '@' not in line: - continue - - # Find all email addresses in the line - emails = re.findall(email_pattern, line) - unique_emails.update(emails) - except Exception as e: - print(f"Error processing {filepath}: {str(e)}", file=sys.stderr) - - return sorted(unique_emails) - -if __name__ == "__main__": - # Check CODEOWNERS in standard locations - codeowners_path = None - for path in ['.github/CODEOWNERS', 'CODEOWNERS', 'docs/CODEOWNERS']: - if os.path.exists(path): - codeowners_path = path - break - - result = { - "emails": [], - "codeowners_path": codeowners_path - } - - if codeowners_path: - emails = extract_emails(codeowners_path) - result["emails"] = emails - - print(json.dumps(result)) diff --git a/.github/scripts/send_email.py b/.github/scripts/send_email.py deleted file mode 100644 index 4c151e30fc..0000000000 --- a/.github/scripts/send_email.py +++ /dev/null @@ -1,97 +0,0 @@ -import os -import smtplib -import logging -import argparse -from email.message import EmailMessage -from email.mime.base import MIMEBase -from email import encoders - -logger = logging.getLogger(__name__) - -def send_email(sender_email: str, to_email: str, subject: str, email_body: str, smtp_user: str, smtp_pwd: str, - smtp_email_server: str, cc_email: str = '', bcc_email: str = '', reply_email: str = '', is_html_body: bool = False, - attachments: str = '') -> None: - - message = EmailMessage() - message["Subject"] = subject - message["From"] = sender_email - if to_email: - to_list = to_email.split(",") - message["To"] = ", ".join(to_list) - if cc_email: - cc_list = cc_email.split(",") - message["Cc"] = ", ".join(cc_list) - if reply_email: - message["Reply-To"] = reply_email - sub_type = 'plain' - if is_html_body: - sub_type = 'html' - 
message.set_content(email_body, subtype=sub_type) - # Set up attachment if any - if attachments: - for attachment in attachments.split(','): - with open(attachment, 'rb') as attachment_file: - attachment_data = attachment_file.read() - message.add_attachment( - attachment_data, - maintype='application', - subtype='octet-stream', - filename=os.path.basename(attachment) - ) - logger.info(f'Setting smtp server {smtp_email_server}...') - smtp_server = smtplib.SMTP(smtp_email_server) - smtp_server.starttls() - smtp_server.login(smtp_user, smtp_pwd) - logger.info(f'smtp server authentication successful') - try: - logger.info(f'Sending email...') - if bcc_email: - # Send bcc list as an argument instead of adding it to the header to keep it hidden - bcc_list = bcc_email.split(",") - smtp_server.send_message(message, bcc=bcc_list) - else: - smtp_server.send_message(message) - logger.info(f'email sent.') - except Exception as ex: - raise ex - finally: - try: - smtp_server.quit() - except smtplib.SMTPServerDisconnected: - pass - finally: - logger.info("smtp connection is closed") - -def main(): - parser = argparse.ArgumentParser(description="Send an email with optional attachments") - parser.add_argument('--sender', required=True, help='Sender email address') - parser.add_argument('--to', required=True, help='Recipient email address(es) (comma-separated)') - parser.add_argument('--subject', required=True, help='Email subject') - parser.add_argument('--body', required=True, help='Email body') - parser.add_argument('--smtp-user', required=True, help='SMTP server username') - parser.add_argument('--smtp-pwd', required=True, help='SMTP server password') - parser.add_argument('--smtp-server', required=True, help='SMTP server address and port') - parser.add_argument('--cc', default='', help='CC email address(es) (comma-separated)') - parser.add_argument('--bcc', default='', help='BCC email address(es) (comma-separated)') - parser.add_argument('--reply-to', default='', 
help='Reply-To email address') - parser.add_argument('--html-body', action='store_true', help='Flag to indicate if email body is HTML') - parser.add_argument('--attachments', default='', help='Attachment file path(s) (space-separated)') - args = parser.parse_args() - - send_email( - sender_email=args.sender, - to_email=args.to, - subject=args.subject, - email_body=args.body, - smtp_user=args.smtp_user, - smtp_pwd=args.smtp_pwd, - smtp_email_server=args.smtp_server, - cc_email=args.cc, - bcc_email=args.bcc, - reply_email=args.reply_to, - is_html_body=args.html_body, - attachments=args.attachments - ) - -if __name__ == '__main__': - main() diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb deleted file mode 100644 index 59ae25d98a..0000000000 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-quanti.ipynb +++ /dev/null @@ -1,300 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a59f475d-d843-46bc-b75e-10984b687ed3", - "metadata": {}, - "source": [ - "# Federated Fine-Tuning of Phi-4 with 8-bit Quantization" - ] - }, - { - "cell_type": "markdown", - "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", - "metadata": {}, - "source": [ - "This notebook demonstrates federated fine-tuning of Microsoft's Phi-4 model (4B parameters) with 8-bit quantization using OpenFL." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import transformers\n", - "from transformers import (\n", - " AutoModelForCausalLM,\n", - " AutoTokenizer,\n", - " BitsAndBytesConfig,\n", - " TrainingArguments\n", - ")\n", - "from peft import LoraConfig, get_peft_model\n", - "from datasets import load_dataset\n", - "from trl import SFTTrainer\n", - "import numpy as np\n", - "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n", - "from openfl.experimental.workflow.placement import aggregator, collaborator\n", - "from openfl.experimental.workflow.runtime import LocalRuntime" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d615d626-8727-4169-b2a6-3ba15c3cdb95", - "metadata": {}, - "outputs": [], - "source": [ - "# 8-bit quantization config for Phi-4\n", - "quant_config = BitsAndBytesConfig(\n", - " load_in_8bit=True,\n", - " llm_int8_threshold=6.0,\n", - " llm_int8_skip_modules=None,\n", - " llm_int8_enable_fp32_cpu_offload=False,\n", - " llm_int8_has_fp16_weight=False\n", - ")\n", - "\n", - "# Model config for Phi-4\n", - "model_kwargs = {\n", - " \"quantization_config\": quant_config,\n", - " \"device_map\": \"auto\",\n", - " \"trust_remote_code\": True,\n", - " \"torch_dtype\": torch.bfloat16\n", - "}\n", - "\n", - "# PEFT config optimized for Phi-4\n", - "peft_config = LoraConfig(\n", - " r=16, # Higher rank for larger model\n", - " lora_alpha=32,\n", - " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", - " lora_dropout=0.05,\n", - " bias=\"none\",\n", - " 
task_type=\"CAUSAL_LM\"\n", - ")\n", - "\n", - "# Training config adjusted for Phi-4\n", - "training_config = TrainingArguments(\n", - " output_dir=\"./results\",\n", - " per_device_train_batch_size=1, # Reduced for 4B model\n", - " per_device_eval_batch_size=1,\n", - " gradient_accumulation_steps=8, # Increased for memory efficiency\n", - " learning_rate=1e-5, # Lower learning rate for larger model\n", - " logging_steps=10,\n", - " num_train_epochs=1,\n", - " max_grad_norm=0.3,\n", - " warmup_ratio=0.03,\n", - " lr_scheduler_type=\"cosine\",\n", - " save_steps=100,\n", - " bf16=True, # Using bfloat16 for Phi-4\n", - " optim=\"adamw_torch\",\n", - " report_to=\"none\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", - "metadata": {}, - "outputs": [], - "source": [ - "# Load Phi-4 model and tokenizer\n", - "model = AutoModelForCausalLM.from_pretrained(\"microsoft/phi-4\", **model_kwargs)\n", - "model = get_peft_model(model, peft_config)\n", - "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/phi-4\")\n", - "tokenizer.pad_token = tokenizer.eos_token\n", - "tokenizer.padding_side = \"left\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", - "metadata": {}, - "outputs": [], - "source": [ - "# Dataset preparation for Phi-4\n", - "def format_instruction(sample):\n", - " return f\"\"\"### Instruction:\n", - "{sample['instruction']}\n", - "\n", - "### Input:\n", - "{sample['input']}\n", - "\n", - "### Response:\n", - "{sample['output']}\"\"\"\n", - "\n", - "dataset = load_dataset(\"json\", data_files=\"math_10k.json\")[\"train\"].train_test_split(test_size=0.1)\n", - "train_data = dataset[\"train\"].shuffle().select(range(50)) # Smaller subset for Phi-4\n", - "val_data = dataset[\"test\"].shuffle().select(range(10))\n", - "\n", - "train_data = train_data.map(lambda x: {\"text\": format_instruction(x)})\n", - "val_data = 
val_data.map(lambda x: {\"text\": format_instruction(x)})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", - "metadata": {}, - "outputs": [], - "source": [ - "class FederatedFlow(FLSpec):\n", - " def __init__(self, model=None, rounds=3, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self.model = model\n", - " self.rounds = rounds\n", - " self.training_metrics = []\n", - " \n", - " @aggregator\n", - " def start(self):\n", - " print(\"Starting federated training for Phi-4\")\n", - " self.collaborators = self.runtime.collaborators\n", - " self.current_round = 0\n", - " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", - " \n", - " @collaborator\n", - " def aggregated_model_validation(self):\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=training_config,\n", - " train_dataset=self.train_data,\n", - " eval_dataset=self.val_data,\n", - " dataset_text_field=\"text\",\n", - " max_seq_length=512,\n", - " tokenizer=tokenizer\n", - " )\n", - " metrics = trainer.evaluate()\n", - " self.validation_loss = metrics[\"eval_loss\"]\n", - " self.next(self.train)\n", - " \n", - " @collaborator\n", - " def train(self):\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=training_config,\n", - " train_dataset=self.train_data,\n", - " eval_dataset=self.val_data,\n", - " dataset_text_field=\"text\",\n", - " max_seq_length=512,\n", - " tokenizer=tokenizer\n", - " )\n", - " train_result = trainer.train()\n", - " self.training_loss = train_result.training_loss\n", - " self.training_metrics.append({\n", - " \"round\": self.current_round,\n", - " \"loss\": self.training_loss,\n", - " \"collaborator\": self.input\n", - " })\n", - " self.next(self.local_model_validation)\n", - " \n", - " @collaborator\n", - " def local_model_validation(self):\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=training_config,\n", - " 
train_dataset=self.train_data,\n", - " eval_dataset=self.val_data,\n", - " dataset_text_field=\"text\",\n", - " max_seq_length=512,\n", - " tokenizer=tokenizer\n", - " )\n", - " metrics = trainer.evaluate()\n", - " self.local_validation_loss = metrics[\"eval_loss\"]\n", - " self.next(self.join, exclude=[\"model\"])\n", - " \n", - " @aggregator\n", - " def join(self, inputs):\n", - " avg_loss = sum(input.training_loss for input in inputs) / len(inputs)\n", - " avg_val_loss = sum(input.validation_loss for input in inputs) / len(inputs)\n", - " \n", - " print(f\"Round {self.current_round} - Avg Training Loss: {avg_loss:.4f}\")\n", - " print(f\"Round {self.current_round} - Avg Validation Loss: {avg_val_loss:.4f}\")\n", - " \n", - " # Aggregate model updates\n", - " self.current_round += 1\n", - " if self.current_round < self.rounds:\n", - " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", - " else:\n", - " self.next(self.end)\n", - " \n", - " @aggregator\n", - " def end(self):\n", - " print(\"Phi-4 Training complete!\")\n", - " print(\"Final Training Metrics:\")\n", - " for metric in self.training_metrics:\n", - " print(f\"Round {metric['round']} - {metric['collaborator']} - Loss: {metric['loss']:.4f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", - "metadata": {}, - "outputs": [], - "source": [ - "# Setup runtime for Phi-4\n", - "aggregator = Aggregator()\n", - "collaborators = [\n", - " Collaborator(name=\"Portland\", private_attributes={\"train_data\": train_data.shard(2, 0), \"val_data\": val_data.shard(2, 0)}),\n", - " Collaborator(name=\"Seattle\", private_attributes={\"train_data\": train_data.shard(2, 1), \"val_data\": val_data.shard(2, 1)})\n", - "]\n", - "\n", - "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend=\"single_process\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", - "metadata": {}, - "outputs": [], - "source": [ - "# Run training for Phi-4\n", - "flow = FederatedFlow(model, rounds=2)\n", - "flow.runtime = runtime\n", - "flow.run()" - ] - }, - { - "cell_type": "markdown", - "id": "7bc8fe27", - "metadata": {}, - "source": [ - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb deleted file mode 100644 index b1ca1bac6e..0000000000 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-sol.ipynb +++ /dev/null @@ -1,438 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a59f475d-d843-46bc-b75e-10984b687ed3", - "metadata": {}, - "source": [ - "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization" - ] - }, - { - "cell_type": "markdown", - "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", - "metadata": {}, - "source": [ - "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n", - "- Parameter-Efficient Fine-Tuning (PEFT)\n", - "- 4-bit Quantization (QLoRA)\n", - "- Gradient Checkpointing\n", - "- Optimized Training Configuration" - ] - }, - { - "cell_type": "markdown", - "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", - "metadata": {}, - "source": [ - "## Installation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install torch transformers 
peft datasets trl==0.12.2 bitsandbytes accelerate -q" - ] - }, - { - "cell_type": "markdown", - "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", - "metadata": {}, - "source": [ - "## Import Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "from transformers import (\n", - " AutoModelForCausalLM,\n", - " AutoTokenizer,\n", - " BitsAndBytesConfig,\n", - " TrainingArguments\n", - ")\n", - "from peft import (\n", - " LoraConfig,\n", - " get_peft_model,\n", - " prepare_model_for_kbit_training,\n", - " PeftModel\n", - ")\n", - "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # Added this import\n", - "from datasets import load_dataset\n", - "from trl import SFTTrainer\n", - "from openfl.experimental.workflow import FLSpec, Aggregator, Collaborator, LocalRuntime\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", - "metadata": {}, - "source": [ - "## Configuration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eada9809-468a-47c6-9b03-55aa887c9487", - "metadata": {}, - "outputs": [], - "source": [ - "# Model and dataset\n", - "model_name = \"microsoft/phi-4\"\n", - "dataset_name = \"math_10k.json\"\n", - "\n", - "# QLoRA configuration\n", - "bnb_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_quant_type=\"nf4\",\n", - " bnb_4bit_compute_dtype=torch.bfloat16,\n", - " bnb_4bit_use_double_quant=True,\n", - ")\n", - "\n", - "# LoRA configuration\n", - "peft_config = LoraConfig(\n", - " r=16, # Increased from original for better adaptation\n", - " lora_alpha=32,\n", - " lora_dropout=0.05,\n", - " bias=\"none\",\n", - " task_type=\"CAUSAL_LM\",\n", - " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"dense\"],\n", - ")\n", - "\n", - "# Training configuration\n", - "training_args 
= TrainingArguments(\n", - " output_dir=\"./results\",\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=1, # Reduced for Phi-4\n", - " gradient_accumulation_steps=2,\n", - " optim=\"paged_adamw_32bit\",\n", - " save_steps=100,\n", - " logging_steps=10,\n", - " learning_rate=2e-4,\n", - " weight_decay=0.001,\n", - " fp16=False,\n", - " bf16=True,\n", - " max_grad_norm=0.3,\n", - " warmup_ratio=0.03,\n", - " lr_scheduler_type=\"cosine\",\n", - " gradient_checkpointing=True,\n", - " report_to=\"none\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", - "metadata": {}, - "source": [ - "## Load and Prepare Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", - "metadata": {}, - "outputs": [], - "source": [ - "# Load tokenizer\n", - "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", - "tokenizer.pad_token = tokenizer.eos_token\n", - "tokenizer.padding_side = \"right\"\n", - "\n", - "# Load model with quantization\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " model_name,\n", - " quantization_config=bnb_config,\n", - " device_map=\"auto\",\n", - " trust_remote_code=True\n", - ")\n", - "\n", - "# Prepare model for k-bit training\n", - "model = prepare_model_for_kbit_training(model)\n", - "\n", - "# Apply LoRA\n", - "model = get_peft_model(model, peft_config)\n", - "model.print_trainable_parameters()" - ] - }, - { - "cell_type": "markdown", - "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", - "metadata": {}, - "source": [ - "## Load and Prepare Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", - "metadata": {}, - "outputs": [], - "source": [ - "def format_prompt(example):\n", - " if example[\"input\"]:\n", - " return f\"\"\"Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n", - "\n", - "### Instruction:\n", - "{example['instruction']}\n", - "\n", - "### Input:\n", - "{example['input']}\n", - "\n", - "### Response:\n", - "{example['output']}\"\"\"\n", - " else:\n", - " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", - "\n", - "### Instruction:\n", - "{example['instruction']}\n", - "\n", - "### Response:\n", - "{example['output']}\"\"\"\n", - "\n", - "# Load dataset\n", - "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\")\n", - "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)})\n", - "\n", - "# Split dataset\n", - "dataset = dataset.train_test_split(test_size=0.1)\n", - "train_dataset = dataset[\"train\"]\n", - "eval_dataset = dataset[\"test\"]" - ] - }, - { - "cell_type": "markdown", - "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", - "metadata": {}, - "source": [ - "## Enhanced Training with SFTTrainer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", - "metadata": {}, - "outputs": [], - "source": [ - "trainer = SFTTrainer(\n", - " model=model,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " peft_config=peft_config,\n", - " dataset_text_field=\"text\",\n", - " max_seq_length=1024,\n", - " tokenizer=tokenizer,\n", - " args=training_args,\n", - " packing=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "810eb75e", - "metadata": {}, - "source": [ - "## Federated Averaging Function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58298e8e-ab9e-4377-966e-143823441697", - "metadata": {}, - "outputs": [], - "source": [ - "def FedAvg(peft_params, model, weights=None):\n", - " \"\"\"\n", - " Perform Federated Averaging (FedAvg) on the model parameters.\n", - " \"\"\"\n", - " state_dicts = peft_params\n", - " state_dict = 
get_peft_model_state_dict(model)\n", - " for key in peft_params[0]:\n", - " dtype = state_dicts[0][key].dtype\n", - " state_dict[key] = torch.from_numpy(\n", - " np.average(\n", - " [state[key].to(torch.float).numpy() for state in state_dicts], \n", - " axis=0, \n", - " weights=weights\n", - " )\n", - " ).to(dtype)\n", - " set_peft_model_state_dict(model, state_dict)\n", - " return model" - ] - }, - { - "cell_type": "markdown", - "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", - "metadata": {}, - "source": [ - "## Federated Learning Workflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", - "metadata": {}, - "outputs": [], - "source": [ - "class FederatedFlow(FLSpec):\n", - " def __init__(self, model=None, rounds=3, **kwargs):\n", - " super().__init__(**kwargs)\n", - " if model is not None:\n", - " self.model = model\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " else:\n", - " raise ValueError(\"No model provided\")\n", - " \n", - " self.rounds = rounds\n", - " \n", - " @aggregator\n", - " def start(self):\n", - " print(\"Initializing federated learning\")\n", - " self.collaborators = self.runtime.collaborators\n", - " self.current_round = 0\n", - " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", - " \n", - " @collaborator\n", - " def aggregated_model_validation(self):\n", - " print(f\"Validating aggregated model for {self.input}\")\n", - " # Load model with quantization\n", - " self.model = AutoModelForCausalLM.from_pretrained(\n", - " model_name,\n", - " quantization_config=bnb_config,\n", - " device_map=\"auto\",\n", - " trust_remote_code=True\n", - " )\n", - " self.model = prepare_model_for_kbit_training(self.model)\n", - " self.model = get_peft_model(self.model, peft_config)\n", - " set_peft_model_state_dict(self.model, self.peft_params)\n", - " \n", - " # Evaluate\n", - " eval_results = trainer.evaluate()\n", - " self.agg_validation_score = 
eval_results[\"eval_loss\"]\n", - " print(f\"Validation loss: {self.agg_validation_score}\")\n", - " self.next(self.train)\n", - " \n", - " @collaborator\n", - " def train(self):\n", - " print(f\"Training on {self.input}\")\n", - " # Train with local data\n", - " trainer.train()\n", - " self.loss = trainer.state.log_history[-1][\"loss\"]\n", - " self.next(self.local_model_validation)\n", - " \n", - " @collaborator\n", - " def local_model_validation(self):\n", - " print(f\"Validating local model for {self.input}\")\n", - " eval_results = trainer.evaluate()\n", - " self.local_validation_score = eval_results[\"eval_loss\"]\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " self.next(self.join, exclude=[\"model\"])\n", - " \n", - " @aggregator\n", - " def join(self, inputs):\n", - " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", - " self.aggregated_model_accuracy = sum(\n", - " input.agg_validation_score for input in inputs\n", - " ) / len(inputs)\n", - " self.local_model_accuracy = sum(\n", - " input.local_validation_score for input in inputs\n", - " ) / len(inputs)\n", - " \n", - " print(f\"Round {self.current_round + 1} results:\")\n", - " print(f\"Average training loss: {self.average_loss}\")\n", - " print(f\"Average validation loss (before training): {self.aggregated_model_accuracy}\")\n", - " print(f\"Average validation loss (after training): {self.local_model_accuracy}\")\n", - " \n", - " # Federated averaging\n", - " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " \n", - " self.current_round += 1\n", - " if self.current_round < self.rounds:\n", - " self.next(self.aggregated_model_validation, foreach=\"collaborators\")\n", - " else:\n", - " self.next(self.end)\n", - " \n", - " @aggregator\n", - " def end(self):\n", - " print(\"Federated training complete!\")\n", - " print(f\"Final model validation loss: 
{self.aggregated_model_accuracy}\")" - ] - }, - { - "cell_type": "markdown", - "id": "7bc8fe27", - "metadata": {}, - "source": [ - "## Run Federated Learning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", - "metadata": {}, - "outputs": [], - "source": [ - "# Setup participants\n", - "aggregator = Aggregator()\n", - "collaborators = [\n", - " Collaborator(name=\"Portland\"),\n", - " Collaborator(name=\"Seattle\"),\n", - " Collaborator(name=\"London\")\n", - "]\n", - "\n", - "# Assign data shards\n", - "for idx, colab in enumerate(collaborators):\n", - " colab.private_attributes = {\n", - " \"train_dataset\": train_dataset.shard(len(collaborators), idx),\n", - " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx)\n", - " }\n", - "\n", - "# Create and run workflow\n", - "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", - "flflow = FederatedFlow(model, rounds=3)\n", - "flflow.runtime = runtime\n", - "flflow.run()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 5321986b45a7ab381a8e9851725360203bc49dc0 Mon Sep 17 00:00:00 2001 From: Rajith Date: Fri, 16 May 2025 11:12:37 +0530 Subject: [PATCH 28/34] reverting local changes --- .github/workflows/trivy.yml | 243 +++++++++++++----------------------- CODEOWNERS | 2 +- 2 files changed, 85 insertions(+), 160 deletions(-) diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 7c9b79761a..62b196f3d1 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -1,4 +1,4 @@ -name: Trivy Nightly Security Scan 
+name: Trivy Nightly Scan on: workflow_call: inputs: @@ -7,192 +7,117 @@ on: type: string workflow_dispatch: schedule: - - cron: '0 0 * * *' # Runs daily at midnight UTC + - cron: '0 0 * * *' # This runs the workflow every night at midnight UTC jobs: - security-scan: + build: if: github.event.pull_request.draft == false permissions: - contents: read - security-events: write - actions: read - packages: read - issues: write + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status runs-on: ubuntu-22.04 - timeout-minutes: 45 + timeout-minutes: 15 + env: - TRIVY_VERSION: 0.50.1 + TRIVY_DB_REPOSITORY: 'ghcr.io/aquasecurity/trivy-db,public.ecr.aws/aquasecurity/trivy-db' COMMIT_ID: ${{ inputs.commit_id || github.sha }} steps: - # ============ SETUP PHASE ============ - - name: Checkout repository + - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ env.COMMIT_ID }} - fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.10' + - name: Install Trivy + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin - # ============ SCANNING PHASE ============ - - name: Run filesystem scan - uses: aquasecurity/trivy-action@0.30.0 + - name: Run Trivy code vulnerability scanner (JSON Output) + run: | + trivy --quiet fs \ + --format json \ + --output trivy-code-results.json \ + --ignore-unfixed \ + --vuln-type os,library \ + --severity CRITICAL,HIGH,MEDIUM \ + . 
+ + - name: Display Trivy code Scan Results + if: failure() # Ensure this step runs regardless of the previous step's outcome + run: | + echo "Trivy Scan Results:" + cat trivy-code-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}' + + - name: Upload Code Vulnerability Scan Results + uses: actions/upload-artifact@v4 with: - scan-type: 'fs' - format: 'json' - output: 'trivy-fs-report.json' - severity: 'CRITICAL,HIGH' - ignore-unfixed: true - vuln-type: 'os,library' - security-checks: 'vuln' + name: trivy-code-report-json + path: trivy-code-results.json - - name: Build Docker image + - name: Build an image from Dockerfile run: | - docker buildx build \ - --pull \ - --tag local/scan-target:${{ github.run_id }} \ - --file openfl-docker/Dockerfile.base \ - --load \ - . + docker build --pull -t docker.io/securefederatedai/openfl:${{ github.sha }} -f openfl-docker/Dockerfile.base . - - name: Scan Docker image + - name: Run Trivy vulnerability scanner for Docker image (JSON Output) + id: trivy-scan uses: aquasecurity/trivy-action@0.30.0 with: - image-ref: 'local/scan-target:${{ github.run_id }}' + image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}' format: 'json' - output: 'trivy-image-report.json' - severity: 'CRITICAL,HIGH' + output: 'trivy-docker-results.json' + exit-code: '1' ignore-unfixed: true vuln-type: 'os,library' - security-checks: 'vuln' - - # ============ REPORTING PHASE ============ - - name: Generate SBOM reports - run: | - trivy fs --format spdx-json --output trivy-fs-sbom.json . 
- trivy image --format spdx-json --output trivy-image-sbom.json local/scan-target:${{ github.run_id }} + severity: 'CRITICAL,HIGH,MEDIUM' + trivyignores: '.trivyignore' - - name: Create consolidated report - id: report + - name: Display Trivy Docker Scan Results + if: failure() # Ensure this step runs regardless of the previous step's outcome run: | - # Initialize markdown report - echo "# Security Scan Report - OpenFL" > report.md - echo "**Scan Date:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> report.md - echo "**Commit:** [${{ env.COMMIT_ID }}](https://github.com/rajithkrishnegowda/openfl/commit/${{ env.COMMIT_ID }})" >> report.md - echo -e "\n## Vulnerability Summary\n" >> report.md - - # Process filesystem results - if [ -f "trivy-fs-report.json" ]; then - FS_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-fs-report.json || echo 0) - echo "### Filesystem Scans" >> report.md - echo "**Critical/High Vulnerabilities:** $FS_VULNS" >> report.md - - if [ "$FS_VULNS" -gt 0 ]; then - echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md - echo "|----------|----|---------|---------|-------------|" >> report.md - jq -r '.Results[]?.Vulnerabilities[]? | "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-fs-report.json >> report.md - echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT - fi - fi - - # Process image results - if [ -f "trivy-image-report.json" ]; then - IMG_VULNS=$(jq '[.Results[]?.Vulnerabilities[]?] | length' trivy-image-report.json || echo 0) - echo -e "\n### Container Image Scans" >> report.md - echo "**Critical/High Vulnerabilities:** $IMG_VULNS" >> report.md - - if [ "$IMG_VULNS" -gt 0 ]; then - echo -e "\n| Severity | ID | Package | Version | Description |" >> report.md - echo "|----------|----|---------|---------|-------------|" >> report.md - jq -r '.Results[]?.Vulnerabilities[]? 
| "| \(.Severity) | \(.VulnerabilityID) | \(.PkgName) | \(.InstalledVersion) | \(.Title) |"' trivy-image-report.json >> report.md - echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT - fi - fi - - # Add artifact download links - echo -e "\n## Next Steps\n" >> report.md - echo "1. Review the full reports in the workflow artifacts" >> report.md - echo "2. Address critical vulnerabilities immediately" >> report.md - echo "3. Create GitHub issues for tracking remediation" >> report.md - - cat report.md - - # ============ NOTIFICATION PHASE ============ - - name: Set notification subject - id: set-subject - run: | - if [[ "${{ job.status }}" == "failure" ]]; then - echo "subject=🚨 OpenFL Security Scan Failed" >> $GITHUB_OUTPUT - elif [[ "${{ steps.report.outputs.has_vulnerabilities }}" == "true" ]]; then - echo "subject=⚠️ OpenFL Vulnerabilities Found" >> $GITHUB_OUTPUT + if [ -s trivy-docker-results.json ]; then + echo "Trivy Scan Results:" + cat trivy-docker-results.json | jq '.Results[] | select(.Vulnerabilities != null) | .Vulnerabilities[] | {VulnerabilityID, PkgName, InstalledVersion, Severity, Description}' else - echo "subject=✅ OpenFL Security Scan Passed" >> $GITHUB_OUTPUT + echo "Trivy scan results file is empty or not found." fi - - - name: Extract CODEOWNERS emails - id: codeowners - run: | - if ! 
command -v python &> /dev/null; then - sudo apt-get update && sudo apt-get install -y python3 - fi - - OUTPUT=$(python .github/scripts/extract_emails.py) - echo "Extracted emails: $OUTPUT" - - EMAILS=$(echo "$OUTPUT" | jq -r '.emails | join(",")') - echo "emails=${EMAILS:-${{ secrets.SECURITY_EMAIL_RECIPIENTS }}}" >> $GITHUB_OUTPUT - - env: - PYTHONIOENCODING: utf-8 - - - name: Prepare email content - id: prepare-email - run: | - # Convert markdown to HTML - python -m pip install markdown - HTML_CONTENT=$(python -c "import markdown; print(markdown.markdown(open('report.md').read()))") - echo "html_body<> $GITHUB_OUTPUT - echo "$HTML_CONTENT" >> $GITHUB_OUTPUT - echo "EOF" >> $GITHUB_OUTPUT - - - name: Send email via Python script - if: always() && (steps.report.outputs.has_vulnerabilities == 'true' || failure()) - env: - SMTP_SERVER: ${{ secrets.SMTP_SERVER }} - SMTP_PORT: ${{ secrets.SMTP_PORT }} - SMTP_USER: ${{ secrets.SMTP_USER }} - SMTP_PASSWORD: ${{ secrets.SMTP_PASSWORD }} - RECIPIENTS: ${{ steps.codeowners.outputs.emails }} - run: | - python .github/scripts/send_email.py \ - --sender "security@openfl.github" \ - --to "$RECIPIENTS" \ - --subject "${{ steps.set-subject.outputs.subject }}" \ - --body "${{ steps.prepare-email.outputs.html_body }}" \ - --smtp-user "$SMTP_USER" \ - --smtp-pwd "$SMTP_PASSWORD" \ - --smtp-server "$SMTP_SERVER:$SMTP_PORT" \ - --html-body - - # ============ ARTIFACT UPLOADS ============ - - name: Upload scan artifacts + + - name: Upload final Trivy Docker Vulnerability Scan uses: actions/upload-artifact@v4 with: - name: security-reports-${{ github.run_id }} - path: | - trivy-fs-report.json - trivy-image-report.json - trivy-fs-sbom.json - trivy-image-sbom.json - report.md - retention-days: 30 + name: trivy-docker-report-json + path: trivy-docker-results.json - # ============ FAILURE HANDLING ============ - - name: Fail workflow if vulnerabilities found - if: steps.report.outputs.has_vulnerabilities == 'true' && github.event_name != 
'schedule' + - name: Run Trivy code vulnerability scanner (SPDX-JSON Output) run: | - echo "::error::Critical/High vulnerabilities detected!" - exit 1 + trivy --quiet fs \ + --format spdx-json \ + --output trivy-code-spdx-results.json \ + --ignore-unfixed \ + --vuln-type os,library \ + --severity CRITICAL,HIGH,MEDIUM \ + . + + - name: Upload SPDX Code Vulnerability Scan Results + uses: actions/upload-artifact@v4 + with: + name: trivy-code-spdx-report-json + path: trivy-code-spdx-results.json + + - name: Run Trivy vulnerability scanner for Docker image (SPDX-JSON Output) + uses: aquasecurity/trivy-action@0.30.0 + with: + image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}' + format: 'spdx-json' + output: 'trivy-docker-spdx-results.json' + exit-code: '1' + ignore-unfixed: true + vuln-type: 'os,library' + severity: 'CRITICAL,HIGH,MEDIUM' + trivyignores: '.trivyignore' + + - name: Upload SPDX Docker Vulnerability Scan + uses: actions/upload-artifact@v4 + with: + name: trivy-docker-spdx-report-json + path: trivy-docker-spdx-results.json diff --git a/CODEOWNERS b/CODEOWNERS index b36cb38d58..cb0b89fc6b 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -30,6 +30,6 @@ /scripts/ aayush.garg@intel.com giribabu.bikki@intel.com karan.shah@intel.com patrick.foley@intel.com srikanth.enugula@intel.com teodor.parvanov@intel.com # File level ownership -CODEOWNERS akshay.pant@intel.com karan.shah@intel.com kevin.ta@intel.com noopur@intel.com patrick.foley@intel.com payal.chaurasiya@intel.com rahul.garg@intel.com rajith.krishnegowda@intel.com shailesh.pant@intel.com shailesh.tanwar@intel.com teodor.parvanov@intel.com +CODEOWNERS aayush.garg@intel.com giribabu.bikki@intel.com patrick.foley@intel.com preethi.asokan@intel.com rahul.garg@intel.com srikanth.enugula@intel.com teodor.parvanov@intel.com test-requirements.txt akshay.pant@intel.com karan.shah@intel.com kevin.ta@intel.com noopur@intel.com patrick.foley@intel.com payal.chaurasiya@intel.com rahul.garg@intel.com 
rajith.krishnegowda@intel.com shailesh.pant@intel.com shailesh.tanwar@intel.com teodor.parvanov@intel.com From ce8dbb916fa8c63e4b1385007d0d3bc599a36b74 Mon Sep 17 00:00:00 2001 From: Rajith Date: Fri, 16 May 2025 15:44:15 +0530 Subject: [PATCH 29/34] added 4bit and 8bit --- .../workflow/LLM/phi-4-withquantization.ipynb | 4167 +++++++++++++++++ 1 file changed, 4167 insertions(+) create mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb new file mode 100644 index 0000000000..8a33373f91 --- /dev/null +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb @@ -0,0 +1,4167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a59f475d-d843-46bc-b75e-10984b687ed3", + "metadata": {}, + "source": [ + "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization" + ] + }, + { + "cell_type": "markdown", + "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", + "metadata": {}, + "source": [ + "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n", + "- Parameter-Efficient Fine-Tuning (PEFT)\n", + "- 4-bit Quantization (QLoRA)\n", + "- Gradient Checkpointing\n", + "- Optimized Training Configuration" + ] + }, + { + "cell_type": "markdown", + "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "05b2ad75-8c7b-499c-902e-dbd5b24361bc", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a7ae1a7e-8c16-4c5a-be57-33d84723aed7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Fri May 16 07:23:10 2025 \n", + "+-----------------------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |\n", + "|-----------------------------------------+------------------------+----------------------+\n", + "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|=========================================+========================+======================|\n", + "| 0 NVIDIA H100 NVL Off | 00000001:00:00.0 Off | 0 |\n", + "| N/A 41C P0 66W / 400W | 1MiB / 95830MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-----------------------------------------+------------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=========================================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------------------+\n" + ] + } + ], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", + "metadata": {}, + "source": [ + "## Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/azureuser/env_name/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2025-05-16 07:23:13,756\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " BitsAndBytesConfig,\n", + " TrainingArguments\n", + ")\n", + "from peft import (\n", + " LoraConfig,\n", + " get_peft_model,\n", + " prepare_model_for_kbit_training,\n", + " PeftModel\n", + ")\n", + "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # Added this import\n", + "from datasets import load_dataset\n", + "from trl import SFTTrainer\n", + "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n", + "from openfl.experimental.workflow.placement import aggregator, collaborator\n", + "from openfl.experimental.workflow.runtime import LocalRuntime\n", + "import numpy as np\n", + "from transformers.trainer_callback import PrinterCallback\n", + "import transformers\n", + "import gc\n", + "import psutil" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74fed8f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Memory optimization setup\n", + "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\" # Enable dynamic memory allocation\n", + "os.environ[\"TRANSFORMERS_ATTN_IMPLEMENTATION\"] = \"flash_attention_2\" # Use optimized attention\n", + "\n", + "def clear_gpu():\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + "\n", + "clear_gpu()" + ] + }, + { + "cell_type": "markdown", + "id": "813b4917", + "metadata": {}, + "source": [ + "## Acquiring and preprocessing dataset\n", + "\n", + "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "6df7bfb4", + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries needed for downloading and verifying the dataset\n", + "import hashlib\n", + "import requests\n", + "\n", + "def file_checksum(file_path, algorithm=\"sha256\"):\n", + " \"\"\"\n", + " Calculate the checksum of a file using the specified hashing algorithm.\n", + " \n", + " Args:\n", + " file_path (str): The path to the file for which the checksum is to be calculated.\n", + " algorithm (str): The hashing algorithm to use (default is 'sha256').\n", + " \n", + " Returns:\n", + " str: The calculated checksum of the file.\n", + " \"\"\"\n", + " hash_func = hashlib.new(algorithm)\n", + " with open(file_path, \"rb\") as f:\n", + " for chunk in iter(lambda: f.read(4096), b\"\"):\n", + " hash_func.update(chunk)\n", + " return hash_func.hexdigest()\n", + "\n", + "\n", + "# Download the dataset if it doesn't exist locally\n", + "if not os.path.exists(\"math_10k.json\"):\n", + " print(\"Downloading math_10k.json dataset...\")\n", + " r = requests.get(\n", + " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n", + " )\n", + " with open(\n", + " \"math_10k.json\",\n", + " \"wb\",\n", + " ) as f:\n", + " f.write(r.content)\n", + " print(\"Download complete.\")\n", + "\n", + " # Verify the integrity of the downloaded file\n", + " actual_checksum = file_checksum(\"math_10k.json\")\n", + " expected_checksum = \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n", + " if actual_checksum != expected_checksum:\n", + " raise ValueError(\n", + " \"Checksum verification failed. 
The file may have been altered.\"\n", + " )\n", + " print(\"Checksum verification successful.\")\n", + "else:\n", + " print(\"Dataset already exists locally.\")\n", + "\n", + "# Set the dataset path to be used later\n", + "dataset_name = \"math_10k.json\"" + ] + }, + { + "cell_type": "markdown", + "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eada9809-468a-47c6-9b03-55aa887c9487", + "metadata": {}, + "outputs": [], + "source": [ + "# Model and dataset\n", + "model_name = \"microsoft/phi-4\"\n", + "#dataset_name = \"math_10k.json\"\n", + "\n", + "# 4-bit QLoRA configuration\n", + "bnb_config_4bit = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_use_double_quant=False,\n", + ")\n", + "\n", + "# 8-bit QLoRA configuration\n", + "bnb_config_8bit = BitsAndBytesConfig(\n", + " load_in_8bit=True,\n", + " llm_int8_enable_fp32_cpu_offload=True,\n", + " llm_int8_skip_modules=['lm_head'],\n", + " llm_int8_threshold=6.0,\n", + " llm_int8_has_fp16_weight=False,\n", + ")\n", + "\n", + "# Active quantization config (will be set to either 4-bit or 8-bit)\n", + "bnb_config = bnb_config_4bit # Default to 4-bit\n", + "\n", + "# LoRA configuration\n", + "peft_config = LoraConfig(\n", + " r=8, # Increased from original for better adaptation\n", + " lora_alpha=16,\n", + " lora_dropout=0.01,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " target_modules=\"all-linear\",\n", + ")\n", + "\n", + "# Training configuration\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./results\",\n", + " num_train_epochs=1,\n", + " per_device_train_batch_size=2, # Reduced for Phi-4\n", + " gradient_accumulation_steps=2,\n", + " optim=\"adamw_torch_fused\",\n", + " save_steps=100,\n", + " logging_steps=10,\n", + " learning_rate=3e-4,\n", + " 
weight_decay=0.001,\n", + " fp16=False,\n", + " bf16=True,\n", + " max_grad_norm=0.5,\n", + " warmup_ratio=0.02,\n", + " lr_scheduler_type=\"cosine\",\n", + " gradient_checkpointing=True,\n", + " report_to=\"none\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", + "metadata": {}, + "source": [ + "## Load and Prepare Model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 6/6 [00:04<00:00, 1.33it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 27,852,800 || all params: 14,687,360,000 || trainable%: 0.1896\n" + ] + } + ], + "source": [ + "# Load tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\"\n", + "\n", + "# Load model with quantization\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " device_map=\"auto\",\n", + " trust_remote_code=True\n", + ")\n", + "\n", + "# Prepare model for k-bit training\n", + "model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)\n", + "\n", + "# Apply LoRA\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", + "metadata": {}, + "source": [ + "## Load and Prepare Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", + "metadata": {}, + "outputs": [], + "source": [ + "def format_prompt(example):\n", + " \"\"\"\n", + " Format the input example into a standardized prompt structure for training.\n", + " \n", + " This function 
creates a consistent instruction-response format for the LLM training:\n", + " - Includes instruction, optional input, and expected output\n", + " - Using a standardized template inspired by instruction-tuning datasets\n", + " \n", + " Parameters:\n", + " example (dict): Dictionary containing 'instruction', 'input' (optional), and 'output' fields\n", + " \n", + " Returns:\n", + " str: Formatted prompt with consistent structure for model training and evaluation\n", + " \"\"\"\n", + " # Handle case where input is provided\n", + " if example[\"input\"]:\n", + " # Format with both instruction and input fields\n", + " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", + "\n", + "### Instruction:\n", + "{example['instruction']}\n", + "\n", + "### Input:\n", + "{example['input']}\n", + "\n", + "### Response:\n", + "{example['output']}\"\"\"\n", + " else:\n", + " # Format with only instruction (no input field)\n", + " return f\"\"\"Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n", + "\n", + "### Instruction:\n", + "{example['instruction']}\n", + "\n", + "### Response:\n", + "{example['output']}\"\"\"\n", + "\n", + "# Load dataset\n", + "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\", num_proc=4)\n", + "# Apply formatting to each example in the dataset\n", + "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)}, num_proc=4)\n", + "\n", + "# Split dataset into training and evaluation subsets\n", + "dataset = dataset.train_test_split(test_size=0.1)\n", + "train_dataset = dataset[\"train\"]\n", + "eval_dataset = dataset[\"test\"]" + ] + }, + { + "cell_type": "markdown", + "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", + "metadata": {}, + "source": [ + "## Enhanced Training with SFTTrainer" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 1832 examples [00:02, 615.45 examples/s]\n", + "Generating train split: 197 examples [00:00, 575.95 examples/s]\n" + ] + } + ], + "source": [ + "trainer = SFTTrainer(\n", + " model=model,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " peft_config=peft_config,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=1024,\n", + " tokenizer=tokenizer,\n", + " args=training_args,\n", + " packing=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "810eb75e", + "metadata": {}, + "source": [ + "## Federated Averaging Function" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "58298e8e-ab9e-4377-966e-143823441697", + "metadata": {}, + "outputs": [], + "source": [ + "def FedAvg(peft_params, model, weights=None):\n", + " \"\"\"\n", + " Perform Federated Averaging (FedAvg) on the model parameters.\n", + "\n", + " Parameters:\n", + " peft_params (list): A list of state dictionaries containing the model parameters from different 
clients.\n", + " model (torch.nn.Module): The model to which the averaged parameters will be applied.\n", + " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n", + "\n", + " Returns:\n", + " torch.nn.Module: The model with the averaged parameters applied.\n", + " \"\"\"\n", + " state_dicts = peft_params\n", + " state_dict = get_peft_model_state_dict(model)\n", + " for key in peft_params[0]:\n", + " dtype = state_dicts[0][key].dtype\n", + " state_dict[key] = torch.from_numpy(\n", + " np.average(\n", + " [state[key].cpu().to(torch.float).numpy() for state in state_dicts], axis=0, weights=weights\n", + " )\n", + " ).to(dtype)\n", + " set_peft_model_state_dict(model, state_dict)\n", + " return model" + ] + }, + { + "cell_type": "markdown", + "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", + "metadata": {}, + "source": [ + "## Federated Learning Workflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aggregator step \"start\" registered\n", + "Collaborator step \"aggregated_model_validation\" registered\n", + "Collaborator step \"train\" registered\n", + "Collaborator step \"local_model_validation\" registered\n", + "Aggregator step \"join\" registered\n", + "Aggregator step \"end\" registered\n" + ] + } + ], + "source": [ + "# Import the required PrinterCallback for proper initialization/removal\n", + "from transformers.trainer_callback import PrinterCallback\n", + "import transformers\n", + "import gc\n", + "import psutil\n", + "\n", + "def get_gpu_memory_info():\n", + " \"\"\"Get detailed GPU memory usage information in megabytes.\n", + " \n", + " This function checks for CUDA availability and returns a dictionary with memory allocation\n", + " information including allocated, reserved, and maximum allocated GPU memory.\n", + " \n", + " Returns:\n", 
+ " dict: Dictionary with memory usage information in MB:\n", + " - allocated: Currently allocated memory by PyTorch tensors\n", + " - reserved: Total memory reserved by PyTorch\n", + " - max_allocated: Maximum allocated memory since last reset\n", + " \n", + " Note:\n", + " Returns zeros for all metrics if CUDA is not available or if an error occurs.\n", + " \"\"\"\n", + " try:\n", + " if torch.cuda.is_available():\n", + " allocated = torch.cuda.memory_allocated() / (1024 * 1024)\n", + " reserved = torch.cuda.memory_reserved() / (1024 * 1024)\n", + " max_allocated = torch.cuda.max_memory_allocated() / (1024 * 1024)\n", + " return {\n", + " \"allocated\": allocated,\n", + " \"reserved\": reserved,\n", + " \"max_allocated\": max_allocated\n", + " }\n", + " else:\n", + " return {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n", + " except:\n", + " return {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n", + "\n", + "class MemoryTracker:\n", + " \"\"\"Track GPU memory usage during training\"\"\"\n", + " def __init__(self, collaborator_name, quant_type):\n", + " self.collaborator_name = collaborator_name\n", + " self.quant_type = quant_type\n", + " self.timestamps = {}\n", + " self.peak = {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n", + " self.training_loss = None\n", + " self.eval_loss = None\n", + " \n", + " def log(self, timestamp):\n", + " \"\"\"Log current memory usage at a specific timestamp\"\"\"\n", + " self.timestamps[timestamp] = get_gpu_memory_info()\n", + " \n", + " def log_loss(self, training_loss=None, eval_loss=None):\n", + " \"\"\"Log training or evaluation loss\"\"\"\n", + " if training_loss is not None:\n", + " self.training_loss = training_loss\n", + " if eval_loss is not None:\n", + " self.eval_loss = eval_loss\n", + " \n", + " def update_peak(self):\n", + " \"\"\"Update peak memory usage values\"\"\"\n", + " current = get_gpu_memory_info()\n", + " self.peak[\"allocated\"] = max(self.peak[\"allocated\"], 
current[\"allocated\"])\n", + " self.peak[\"reserved\"] = max(self.peak[\"reserved\"], current[\"reserved\"])\n", + " self.peak[\"max_allocated\"] = max(self.peak[\"max_allocated\"], current[\"max_allocated\"])\n", + " \n", + " def reset_peak(self):\n", + " \"\"\"Reset peak memory usage values\"\"\"\n", + " self.peak = {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n", + " \n", + " def report(self):\n", + " \"\"\"Print memory usage report\"\"\"\n", + " print(f\"\\n==== Memory Usage Report for {self.collaborator_name} ({self.quant_type}) ====\")\n", + " print(f\"Peak Memory Usage:\")\n", + " print(f\" Allocated: {self.peak['allocated']:.2f} MB\")\n", + " print(f\" Reserved: {self.peak['reserved']:.2f} MB\")\n", + " print(f\" Max Allocated: {self.peak['max_allocated']:.2f} MB\")\n", + " \n", + " print(\"\\nMemory Usage by Stage:\")\n", + " for timestamp, mem in self.timestamps.items():\n", + " print(f\" {timestamp}:\")\n", + " print(f\" Allocated: {mem['allocated']:.2f} MB\")\n", + " print(f\" Reserved: {mem['reserved']:.2f} MB\")\n", + " print(f\" Max Allocated: {mem['max_allocated']:.2f} MB\")\n", + " \n", + " print(\"\\nPerformance Metrics:\")\n", + " if self.training_loss is not None:\n", + " print(f\" Training Loss: {self.training_loss:.4f}\")\n", + " if self.eval_loss is not None:\n", + " print(f\" Evaluation Loss: {self.eval_loss:.4f}\")\n", + " print(\"-\" * 50)\n", + " \n", + " def get_stats(self):\n", + " \"\"\"Get all statistics as a dictionary\"\"\"\n", + " stats = {\n", + " \"peak_allocated\": self.peak[\"allocated\"],\n", + " \"peak_reserved\": self.peak[\"reserved\"],\n", + " \"peak_max_allocated\": self.peak[\"max_allocated\"],\n", + " \"quant_type\": self.quant_type,\n", + " \"training_loss\": self.training_loss,\n", + " \"eval_loss\": self.eval_loss\n", + " }\n", + " for timestamp, mem in self.timestamps.items():\n", + " stats[f\"{timestamp}_allocated\"] = mem[\"allocated\"]\n", + " stats[f\"{timestamp}_reserved\"] = mem[\"reserved\"]\n", 
+ " stats[f\"{timestamp}_max_allocated\"] = mem[\"max_allocated\"]\n", + " return stats\n", + "\n", + "def plot_memory_metrics(flow_4bit, flow_8bit):\n", + " \"\"\"Plot and compare memory metrics between 4-bit and 8-bit quantization.\"\"\"\n", + " try:\n", + " import matplotlib.pyplot as plt\n", + " import pandas as pd\n", + " from matplotlib.ticker import EngFormatter\n", + "\n", + " # Create figure with multiple subplots\n", + " fig, axs = plt.subplots(2, 2, figsize=(16, 12))\n", + " fig.suptitle('4-bit vs 8-bit Quantization Comparison', fontsize=16)\n", + " \n", + " # Colors for consistent plotting\n", + " colors_4bit = {'Portland': 'blue', 'Seattle': 'green'}\n", + " colors_8bit = {'Portland': 'darkblue', 'Seattle': 'darkgreen'}\n", + " markers_4bit = {'Portland': 'o', 'Seattle': 's'}\n", + " markers_8bit = {'Portland': '^', 'Seattle': 'D'}\n", + " \n", + " # Flatten the metric data for plotting\n", + " memory_data = []\n", + " for quant, flow in [(\"4-bit\", flow_4bit), (\"8-bit\", flow_8bit)]:\n", + " stats = flow.all_memory_stats\n", + " for collab, rounds_data in stats.items():\n", + " for round_name, metrics in rounds_data.items():\n", + " round_num = int(round_name.split('_')[1])\n", + " row = {\n", + " 'Collaborator': collab,\n", + " 'Round': round_num,\n", + " 'Quantization': quant,\n", + " 'Peak Memory (MB)': metrics.get('peak_max_allocated', 0),\n", + " 'Training Loss': metrics.get('training_loss', 0),\n", + " 'Eval Loss': metrics.get('eval_loss', 0)\n", + " }\n", + " memory_data.append(row)\n", + " \n", + " df = pd.DataFrame(memory_data)\n", + " \n", + " # Plot 1: Peak Memory Usage by Round\n", + " axs[0, 0].set_title('Peak Memory Usage by Round')\n", + " for quant_type in ['4-bit', '8-bit']:\n", + " for collab in df['Collaborator'].unique():\n", + " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n", + " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n", + " marker = 
markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n", + " axs[0, 0].plot(subset['Round'], subset['Peak Memory (MB)'], \n", + " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n", + " color=color)\n", + " \n", + " axs[0, 0].set_xlabel('Round')\n", + " axs[0, 0].set_ylabel('Memory (MB)')\n", + " axs[0, 0].legend()\n", + " axs[0, 0].grid(True, alpha=0.3)\n", + " axs[0, 0].yaxis.set_major_formatter(EngFormatter(unit='B'))\n", + " \n", + " # Plot 2: Training Loss by Round\n", + " axs[0, 1].set_title('Training Loss by Round')\n", + " for quant_type in ['4-bit', '8-bit']:\n", + " for collab in df['Collaborator'].unique():\n", + " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n", + " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n", + " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n", + " axs[0, 1].plot(subset['Round'], subset['Training Loss'], \n", + " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n", + " color=color)\n", + " \n", + " axs[0, 1].set_xlabel('Round')\n", + " axs[0, 1].set_ylabel('Loss')\n", + " axs[0, 1].legend()\n", + " axs[0, 1].grid(True, alpha=0.3)\n", + " \n", + " # Plot 3: Eval Loss by Round\n", + " axs[1, 0].set_title('Evaluation Loss by Round')\n", + " for quant_type in ['4-bit', '8-bit']:\n", + " for collab in df['Collaborator'].unique():\n", + " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n", + " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n", + " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n", + " axs[1, 0].plot(subset['Round'], subset['Eval Loss'], \n", + " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n", + " color=color)\n", + " \n", + " axs[1, 0].set_xlabel('Round')\n", + " axs[1, 0].set_ylabel('Loss')\n", + " axs[1, 0].legend()\n", + " axs[1, 0].grid(True, 
alpha=0.3)\n", + " \n", + " # Plot 4: Memory vs Loss (bubble chart)\n", + " axs[1, 1].set_title('Memory Usage vs. Evaluation Loss')\n", + " for quant_type in ['4-bit', '8-bit']:\n", + " for collab in df['Collaborator'].unique():\n", + " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n", + " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n", + " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n", + " \n", + " # Size proportional to round number for visual differentiation\n", + " sizes = [100 * (r+1) for r in subset['Round']]\n", + " \n", + " axs[1, 1].scatter(subset['Peak Memory (MB)'], subset['Eval Loss'],\n", + " s=sizes, alpha=0.7, \n", + " label=f\"{collab} ({quant_type})\",\n", + " color=color, marker=marker)\n", + " \n", + " # Add round number annotations\n", + " for _, row in subset.iterrows():\n", + " axs[1, 1].annotate(f\"R{int(row['Round'])}\", \n", + " (row['Peak Memory (MB)'], row['Eval Loss']),\n", + " xytext=(5, 5), textcoords='offset points')\n", + " \n", + " axs[1, 1].set_xlabel('Peak Memory (MB)')\n", + " axs[1, 1].set_ylabel('Evaluation Loss')\n", + " axs[1, 1].legend()\n", + " axs[1, 1].grid(True, alpha=0.3)\n", + " axs[1, 1].xaxis.set_major_formatter(EngFormatter(unit='B'))\n", + " \n", + " plt.tight_layout()\n", + " plt.subplots_adjust(top=0.92)\n", + " plt.show()\n", + " \n", + " # Print summary comparison\n", + " print(\"\\n==== Performance Summary ====\\n\")\n", + " # Group by quantization and compute means\n", + " summary = df.groupby('Quantization').agg({\n", + " 'Peak Memory (MB)': 'mean',\n", + " 'Training Loss': 'mean', \n", + " 'Eval Loss': 'mean'\n", + " }).reset_index()\n", + " \n", + " # Calculate percentage difference\n", + " mem_diff_pct = ((summary.loc[1, 'Peak Memory (MB)'] - summary.loc[0, 'Peak Memory (MB)']) / \n", + " summary.loc[0, 'Peak Memory (MB)'] * 100)\n", + " \n", + " eval_diff_pct = ((summary.loc[1, 'Eval Loss'] - 
summary.loc[0, 'Eval Loss']) / \n", + " summary.loc[0, 'Eval Loss'] * 100)\n", + " \n", + " print(f\"Memory Usage Comparison:\")\n", + " print(f\" 4-bit Avg: {summary.loc[0, 'Peak Memory (MB)']:.2f} MB\")\n", + " print(f\" 8-bit Avg: {summary.loc[1, 'Peak Memory (MB)']:.2f} MB\")\n", + " print(f\" Difference: {abs(mem_diff_pct):.1f}% {'more' if mem_diff_pct > 0 else 'less'} memory with 8-bit\")\n", + " \n", + " print(f\"\\nEvaluation Loss Comparison:\")\n", + " print(f\" 4-bit Avg: {summary.loc[0, 'Eval Loss']:.4f}\")\n", + " print(f\" 8-bit Avg: {summary.loc[1, 'Eval Loss']:.4f}\")\n", + " print(f\" Difference: {abs(eval_diff_pct):.1f}% {'higher' if eval_diff_pct > 0 else 'lower'} loss with 8-bit\")\n", + " \n", + " loss_efficiency = ((summary.loc[0, 'Eval Loss'] - summary.loc[1, 'Eval Loss']) / \n", + " (summary.loc[0, 'Peak Memory (MB)'] - summary.loc[1, 'Peak Memory (MB)']))\n", + " \n", + " if loss_efficiency > 0:\n", + " efficiency_msg = \"8-bit provides better memory efficiency with lower loss\"\n", + " else:\n", + " efficiency_msg = \"4-bit provides better memory efficiency with lower loss\"\n", + " \n", + " print(f\"\\nEfficiency Analysis: {efficiency_msg}\")\n", + " except ImportError:\n", + " print(\"Plotting requires matplotlib and pandas. Install with: pip install matplotlib pandas\")\n", + " except Exception as e:\n", + " print(f\"Error plotting metrics: {str(e)}\")\n", + "\n", + "class FederatedFlow(FLSpec):\n", + " def __init__(self, model=None, optimizer=None, rounds=3, quant_type=\"4bit\", **kwargs):\n", + " \"\"\"\n", + " Initialize the class with the given model, optimizer, and number of rounds.\n", + "\n", + " Parameters:\n", + " model (torch.nn.Module, optional): The model to be used. 
If None, a ValueError is raised.\n", + " optimizer (torch.optim.Optimizer, optional): The optimizer to be used.\n", + " rounds (int, optional): The number of rounds for training or processing (default is 3).\n", + " quant_type (str, optional): Quantization type, either \"4bit\" or \"8bit\".\n", + " **kwargs: Additional keyword arguments to be passed to the superclass initializer.\n", + "\n", + " Raises:\n", + " ValueError: If no model is provided.\n", + " \"\"\"\n", + " super().__init__(**kwargs)\n", + " if model is not None:\n", + " self.model = model\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " self.optimizer = optimizer\n", + " else:\n", + " raise ValueError(\"No model inputted\")\n", + "\n", + " self.rounds = rounds\n", + " self.quant_type = quant_type\n", + " # Initialize histories for tracking metrics over rounds\n", + " self.average_loss_history = []\n", + " self.agg_model_loss_history = []\n", + " self.local_model_loss_history = []\n", + " \n", + "\n", + " @aggregator\n", + " def start(self):\n", + " \"\"\"\n", + " Initialize the model and set up the collaborators for federated learning.\n", + "\n", + " This method performs the initial setup for the model, including setting the\n", + " collaborators, initializing private variables, and starting the first round\n", + " of the federated learning process.\n", + " \"\"\"\n", + " print(f\"Performing initialization for model with {self.quant_type} quantization\")\n", + " self.collaborators = self.runtime.collaborators\n", + " self.current_round = 0\n", + " # Initialize dictionary to collect memory stats\n", + " # Check if collaborators are objects with name attribute or strings\n", + " if hasattr(self.collaborators[0], 'name'):\n", + " collab_names = [c.name for c in self.collaborators]\n", + " else:\n", + " # If collaborators are already strings, use them directly\n", + " collab_names = self.collaborators\n", + " self.all_memory_stats = {collab: {} for collab in collab_names}\n", + " 
self.next(\n", + " self.aggregated_model_validation,\n", + " foreach=\"collaborators\",\n", + " )\n", + "\n", + " \n", + " @collaborator\n", + " def aggregated_model_validation(self):\n", + " \"\"\"\n", + " Perform aggregated model validation for a collaborator.\n", + "\n", + " This method loads the model, applies the PEFT configuration, and evaluates\n", + " the model using the provided training and evaluation datasets. The validation\n", + " score is then stored and the next step in the process is triggered.\n", + " \"\"\"\n", + " print(f\"Performing aggregated model validation for collaborator {self.input} with {self.quant_type}\")\n", + " # Initialize memory tracker for this collaborator\n", + " self.memory_tracker = MemoryTracker(self.input, self.quant_type)\n", + " self.memory_tracker.reset_peak()\n", + " \n", + " # Choose quantization config based on quant_type\n", + " if self.quant_type == \"4bit\":\n", + " quant_config = bnb_config_4bit\n", + " else: # 8bit\n", + " quant_config = bnb_config_8bit\n", + " \n", + " # Define device_map variable\n", + " #device_map = \"auto\"\n", + " device_map = {\"\": torch.cuda.current_device()} if torch.cuda.is_available() else \"cpu\"\n", + " try:\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=quant_config,\n", + " device_map=device_map,\n", + " trust_remote_code=True\n", + " )\n", + " self.memory_tracker.log(\"model_load\")\n", + " except ValueError:\n", + " # Fallback to CPU if GPU memory is insufficient\n", + " print(f\"Falling back to CPU mode for {self.input}\")\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " device_map=\"cpu\",\n", + " trust_remote_code=True\n", + " )\n", + " self.memory_tracker.log(\"model_load\")\n", + " \n", + " self.model = prepare_model_for_kbit_training(self.model)\n", + " self.model = get_peft_model(self.model, peft_config)\n", + " set_peft_model_state_dict(self.model, self.peft_params)\n", + " 
\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_args,\n", + " peft_config=peft_config,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=1024,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + "\n", + " trainer.remove_callback(PrinterCallback)\n", + " out = trainer.evaluate()\n", + " self.agg_validation_score = out[\"eval_loss\"]\n", + " print(f\"{self.input} value of {self.agg_validation_score}\")\n", + " self.memory_tracker.log_loss(eval_loss=self.agg_validation_score) # Log eval loss\n", + " self.memory_tracker.update_peak()\n", + " self.next(self.train)\n", + "\n", + " @collaborator\n", + " def train(self):\n", + " \"\"\"\n", + " Train the model for a collaborator.\n", + "\n", + " This method trains the model using the provided training and evaluation datasets.\n", + " The training loss is stored, the model is saved, and the next step in the process\n", + " is triggered.\n", + " \"\"\"\n", + " self.memory_tracker.log(\"before_training\")\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_args,\n", + " peft_config=peft_config,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=1024,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + "\n", + " out = trainer.train()\n", + " self.loss = out.training_loss\n", + " self.memory_tracker.log(\"after_training\")\n", + " self.memory_tracker.log_loss(training_loss=self.loss) # Log training loss\n", + " self.memory_tracker.update_peak()\n", + " 
trainer.save_model()\n", + " self.training_completed = True\n", + " self.next(self.local_model_validation)\n", + "\n", + " @collaborator\n", + " def local_model_validation(self):\n", + " \"\"\"\n", + " Perform local model validation for a collaborator.\n", + "\n", + " This method evaluates the model using the provided training and evaluation datasets.\n", + " The validation score is stored, the PEFT parameters are updated, and the next step\n", + " in the process is triggered.\n", + " \"\"\"\n", + " trainer = SFTTrainer(\n", + " model=self.model,\n", + " args=training_args,\n", + " peft_config=peft_config,\n", + " train_dataset=self.train_dataset,\n", + " eval_dataset=self.eval_dataset,\n", + " max_seq_length=1024,\n", + " dataset_text_field=\"text\",\n", + " tokenizer=tokenizer,\n", + " packing=True,\n", + " data_collator=transformers.DataCollatorForSeq2Seq(\n", + " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", + " ),\n", + " )\n", + " out = trainer.evaluate()\n", + " self.local_validation_score = out[\"eval_loss\"]\n", + " self.memory_tracker.log_loss(eval_loss=self.local_validation_score) # Log eval loss\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + " print(f\"Doing local model validation for collaborator {self.input}\")\n", + " \n", + " # Display memory report for this collaborator\n", + " self.memory_tracker.report()\n", + " self.memory_stats = self.memory_tracker.get_stats()\n", + " self.next(self.join, exclude=[\"training_completed\", \"model\", \"memory_tracker\"])\n", + "\n", + " @aggregator\n", + " def join(self, inputs):\n", + " \"\"\"\n", + " Aggregate the results from all collaborators and update the model.\n", + "\n", + " This method calculates the average loss, aggregated model accuracy, and local model\n", + " accuracy from all collaborators. 
The model parameters are updated using Federated\n", + " Averaging (FedAvg), and the next round of the process is triggered if applicable.\n", + " \"\"\"\n", + " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", + " self.aggregated_model_accuracy = sum(\n", + " input.agg_validation_score for input in inputs\n", + " ) / len(inputs)\n", + " self.local_model_accuracy = sum(\n", + " input.local_validation_score for input in inputs\n", + " ) / len(inputs)\n", + " print(\n", + " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n", + " )\n", + " print(f\"Average training loss = {self.average_loss}\")\n", + " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n", + "\n", + " # Store metrics in history for plotting trends\n", + " self.average_loss_history.append(self.average_loss)\n", + " self.agg_model_loss_history.append(self.aggregated_model_accuracy)\n", + " self.local_model_loss_history.append(self.local_model_accuracy)\n", + " \n", + " # Collect memory stats from all collaborators for this round\n", + " for input_data in inputs:\n", + " self.all_memory_stats[input_data.input][f\"round_{self.current_round}\"] = input_data.memory_stats\n", + "\n", + " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", + " self.peft_params = get_peft_model_state_dict(self.model)\n", + "\n", + " self.model.save_pretrained(\"./aggregated/model\")\n", + " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n", + " self.current_round += 1\n", + " if self.current_round < self.rounds:\n", + " self.next(\n", + " self.aggregated_model_validation,\n", + " foreach=\"collaborators\",\n", + " exclude=[\"model\"],\n", + " )\n", + " else:\n", + " self.next(self.end)\n", + "\n", + " @aggregator\n", + " def end(self):\n", + " \"\"\"\n", + " End the federated learning process.\n", + "\n", + " This method marks the end of the federated learning process and performs any\n", + " necessary 
cleanup or finalization steps.\n", + " \"\"\"\n", + " print(f\"This is the end of the flow for {self.quant_type} quantization\")\n", + " print(\"\\n===== Final Metrics =====\\n\")\n", + " print(f\"Average Training Loss: {self.average_loss_history[-1]:.4f}\")\n", + " print(f\"Final Aggregated Model Loss: {self.agg_model_loss_history[-1]:.4f}\")\n", + " print(f\"Final Local Model Loss: {self.local_model_loss_history[-1]:.4f}\")\n", + " \n", + " print(\"\\n===== Memory Usage Summary Across All Rounds =====\\n\")\n", + " \n", + " # Print aggregated memory statistics\n", + " for collab, rounds_data in self.all_memory_stats.items():\n", + " print(f\"\\n==== {collab} Memory Usage Across Rounds ({self.quant_type}) ====\\n\")\n", + " for round_name, stats in rounds_data.items():\n", + " print(f\" {round_name}:\")\n", + " for metric, value in stats.items():\n", + " if value is not None:\n", + " if metric in ['training_loss', 'eval_loss', 'quant_type']:\n", + " if metric != 'quant_type':\n", + " print(f\" {metric}: {value:.4f}\")\n", + " else:\n", + " print(f\" {metric}: {value:.2f} MB\")\n", + " else:\n", + " print(f\" {metric}: Not recorded\")\n", + " print(\"-\" * 50)" + ] + }, + { + "cell_type": "markdown", + "id": "7bc8fe27", + "metadata": {}, + "source": [ + "## Run Federated Learning with 4-bit and 8-bit Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=============== Running with 4-bit Quantization ===============\n", + "\n", + "\n", + "Calling start\n", + "\u001b[94mPerforming initialization for model with 4bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 915 examples [00:01, 626.55 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 96 examples [00:00, 570.74 examples/s]\u001b[0m\u001b[94m0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [12/12 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland value of 0.5819987058639526\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 915 examples [00:01, 619.44 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 96 examples [00:00, 628.10 examples/s]\u001b[0m\u001b[94m0m\u001b[94m\n", + "\u001b[0m`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [229/229 09:05, Epoch 1/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.501600
200.373200
300.337300
400.348300
500.334300
600.331800
700.322600
800.325800
900.320700
1000.328900
1100.303000
1200.313100
1300.312800
1400.320100
1500.308100
1600.312800
1700.325200
1800.307200
1900.320100
2000.310600
2100.308200
2200.310700

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [12/12 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35323.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60420.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 32979.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 45302.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 48011.95 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35078.37 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60420.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35323.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 41244.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.3295\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3029\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model 
validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Generating train split: 917 examples [00:01, 622.35 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 100 examples [00:00, 631.84 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle value of 0.5914124846458435\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [229/229 09:07, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.487200
200.371700
300.346800
400.334800
500.324900
600.319300
700.331200
800.336300
900.310900
1000.308100
1100.313600
1200.312600
1300.318200
1400.323600
1500.304700
1600.323100
1700.305100
1800.314500
1900.303500
2000.314900
2100.327900
2200.314100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 23170.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 48256.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 20890.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 33280.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 22957.36 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 48256.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 23170.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 29538.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.3287\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3178\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling join\n", + "\u001b[94mAverage aggregated model validation values = 
0.5867055952548981\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage training loss = 0.3290792714039832\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage local model validation values = 0.310357466340065\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [12/12 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland value of 0.30139902234077454\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [229/229 09:06, Epoch 1/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.298900
200.293900
300.279100
400.293300
500.290500
600.291100
700.280500
800.289900
900.289200
1000.298400
1100.275500
1200.290700
1300.290200
1400.301600
1500.290300
1600.299000
1700.313100
1800.297100
1900.313400
2000.303800
2100.302100
2200.306200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [12/12 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59824.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33255.83 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 33440.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35322.87 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59824.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 41948.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2949\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.2986\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model 
validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle value of 0.3157660961151123\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [229/229 09:07, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.297100
200.287600
300.288400
400.284300
500.282800
600.281400
700.292800
800.296100
900.276100
1000.277400
1100.283700
1200.288200
1300.296800
1400.301800
1500.285500
1600.308200
1700.292100
1800.305600
1900.296300
2000.306900
2100.321900
2200.309400

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59804.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33255.83 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 33424.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35322.87 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59804.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 41528.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2942\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3126\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling join\n", + "\u001b[94mAverage aggregated model validation values = 
0.3085825592279434\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage training loss = 0.29453011579388616\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage local model validation values = 0.30557093024253845\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling end\n", + "\u001b[94mThis is the end of the flow for 4bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Final Metrics =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage Training Loss: 0.2945\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.3086\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Local Model Loss: 0.3056\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Memory Usage Summary Across All Rounds =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Portland Memory Usage Across Rounds (4bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 35323.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60420.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.3295\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3029\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 32979.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 45302.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 48011.95 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 35078.37 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60420.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 35323.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 41244.00 MB\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m after_training_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 35535.52 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 59824.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2949\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.2986\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 33255.83 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 33440.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 35322.87 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 59824.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 35535.52 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 41948.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Seattle Memory Usage Across Rounds (4bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 23170.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 48256.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.3287\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3178\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 20890.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 
33280.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 22957.36 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 48256.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 23170.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 29538.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 35535.52 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 59804.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2942\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3126\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 33255.83 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 33424.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 35322.87 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 59804.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 35535.52 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 41528.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mCleared CUDA cache between runs\n", + "\n", + "=============== Running with 8-bit 
Quantization ===============\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 6/6 [00:04<00:00, 1.36it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling start\n", + "\u001b[94mPerforming initialization for model with 8bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [12/12 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland value of 0.5662918090820312\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [229/229 13:26, Epoch 1/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.494400
200.367000
300.331100
400.342600
500.329000
600.326600
700.315600
800.320400
900.316400
1000.323500
1100.297700
1200.309000
1300.308100
1400.315500
1500.303500
1600.307800
1700.320400
1800.304000
1900.314900
2000.305600
2100.303100
2200.306500

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [12/12 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75278.13 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 93310.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 72811.06 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 91120.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 91914.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75278.13 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 93310.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75250.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 83158.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.3243\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.2989\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model 
validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs 
will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle value of 0.5757399201393127\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [229/229 13:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.481300
200.365600
300.344500
400.331200
500.320000
600.314700
700.326100
800.331600
900.306900
1000.304800
1100.309200
1200.308300
1300.313500
1400.318300
1500.299200
1600.318800
1700.300500
1800.310100
1900.299400
2000.310000
2100.323100
2200.310700

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 58276.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 83620.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 55775.01 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 74152.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 58145.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 83620.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 58276.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 65726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.3242\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3134\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling join\n", + "\u001b[94mAverage aggregated model validation values = 
0.571015864610672\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage training loss = 0.3242545044578319\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage local model validation values = 0.30610978603363037\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: 
inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [12/12 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland value of 0.296934574842453\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [229/229 13:24, Epoch 1/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.295100
200.290700
300.277600
400.291800
500.286200
600.288300
700.277600
800.286000
900.285200
1000.295400
1100.272500
1200.288300
1300.288100
1400.298300
1500.287800
1600.295400
1700.309200
1800.293700
1900.308100
2000.299100
2100.296900
2200.301700

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [12/12 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75488.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 93552.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 73023.56 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 74272.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75488.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 93552.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75466.38 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 82944.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2914\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.2939\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling aggregated_model_validation\n", + "\u001b[94mPerforming aggregated model 
validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs 
will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle value of 0.31116044521331787\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [229/229 13:28, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.294400
200.284700
300.289000
400.282800
500.279500
600.277900
700.289600
800.294600
900.275700
1000.276200
1100.280600
1200.284700
1300.292600
1400.297800
1500.281700
1600.305000
1700.288500
1800.301900
1900.292400
2000.303000
2100.317300
2200.305900

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75529.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 93476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 73023.56 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 74242.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75388.56 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 93476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 75529.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 83044.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2912\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3080\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling join\n", + "\u001b[94mAverage aggregated model validation values = 
0.30404751002788544\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage training loss = 0.29131152119699005\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage local model validation values = 0.3009362369775772\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling end\n", + "\u001b[94mThis is the end of the flow for 8bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Final Metrics =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage Training Loss: 0.2913\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.3040\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Local Model Loss: 0.3009\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Memory Usage Summary Across All Rounds =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Portland Memory Usage Across Rounds (8bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 75278.13 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 93310.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.3243\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.2989\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 72811.06 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 91120.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 91914.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 75278.13 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 93310.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 75250.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 83158.00 MB\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m after_training_max_allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 75488.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 93552.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2914\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.2939\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 73023.56 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 74272.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 75488.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 93552.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 75466.38 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 82944.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Seattle Memory Usage Across Rounds (8bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 58276.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 83620.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.3242\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3134\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 55775.01 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 
74152.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 58145.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 83620.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 58276.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 65726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 75529.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 93476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2912\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3080\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 73023.56 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 74242.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 75388.56 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 93476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 75529.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 83044.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# Setup participants\n", + "aggregator = Aggregator()\n", + 
"collaborators = [\n", + " Collaborator(name=\"Portland\"),\n", + " Collaborator(name=\"Seattle\")\n", + "]\n", + "\n", + "# Assign data shards\n", + "for idx, colab in enumerate(collaborators):\n", + " colab.private_attributes = {\n", + " \"train_dataset\": train_dataset.shard(len(collaborators), idx),\n", + " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx)\n", + " }\n", + "\n", + "# First run with 4-bit quantization\n", + "print(\"\\n=============== Running with 4-bit Quantization ===============\\n\")\n", + "bnb_config = bnb_config_4bit # Set active config to 4-bit\n", + "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", + "flflow_4bit = FederatedFlow(model, rounds=2, quant_type=\"4bit\")\n", + "flflow_4bit.runtime = runtime\n", + "flflow_4bit.run()\n", + "\n", + "# Clean up CUDA cache between runs\n", + "if torch.cuda.is_available():\n", + " torch.cuda.empty_cache()\n", + " print(\"Cleared CUDA cache between runs\")\n", + "\n", + "# Then run with 8-bit quantization\n", + "print(\"\\n=============== Running with 8-bit Quantization ===============\\n\")\n", + "bnb_config = bnb_config_8bit # Set active config to 8-bit\n", + "# Reload the model with 8-bit quantization\n", + "model_8bit = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config_8bit,\n", + " device_map=\"auto\",\n", + " trust_remote_code=True\n", + ")\n", + "model_8bit = prepare_model_for_kbit_training(model_8bit)\n", + "model_8bit = get_peft_model(model_8bit, peft_config)\n", + "\n", + "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", + "flflow_8bit = FederatedFlow(model_8bit, rounds=2, quant_type=\"8bit\")\n", + "flflow_8bit.runtime = runtime\n", + "flflow_8bit.run()" + ] + }, + { + "cell_type": "markdown", + "id": "6e1268ae", + "metadata": {}, + "source": [ + "## Compare VRAM Usage and Training Loss Between 4-bit and 8-bit" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + 
"id": "6f5caae6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== Memory Usage Comparison: 4-bit vs 8-bit ====\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== Performance Summary ====\n", + "\n", + "Memory Usage Comparison:\n", + " 4-bit Avg: 57698.24 MB\n", + " 8-bit Avg: 92521.30 MB\n", + " Difference: 60.4% more memory with 8-bit\n", + "\n", + "Evaluation Loss Comparison:\n", + " 4-bit Avg: 0.3080\n", + " 8-bit Avg: 0.3035\n", + " Difference: 1.4% lower loss with 8-bit\n", + "\n", + "Efficiency Analysis: 4-bit provides better memory efficiency with lower loss\n" + ] + } + ], + "source": [ + "# Plot memory metrics comparison between 4-bit and 8-bit\n", + "print(\"\\n==== Memory Usage Comparison: 4-bit vs 8-bit ====\\n\")\n", + "plot_memory_metrics(flflow_4bit, flflow_8bit)" + ] + }, + { + "cell_type": "markdown", + "id": "19dfbe72", + "metadata": {}, + "source": [ + "## Analysis and Conclusions" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5da1c53e", + "metadata": {}, + "outputs": [], + "source": [ + "# Utility functions to monitor GPU memory usage\n", + "def get_gpu_memory_info():\n", + " \"\"\"Get current GPU memory usage in MB\"\"\"\n", + " if torch.cuda.is_available():\n", + " # Get the current device\n", + " device = torch.cuda.current_device()\n", + " # Get memory information\n", + " total_memory = torch.cuda.get_device_properties(device).total_memory / 1024**2 # MB\n", + " allocated_memory = torch.cuda.memory_allocated(device) / 1024**2 # MB\n", + " reserved_memory = torch.cuda.memory_reserved(device) / 1024**2 # MB\n", + " free_memory = total_memory - allocated_memory\n", + " \n", + " return {\n", + " \"device\": device,\n", + " \"total_memory_mb\": total_memory,\n", + " \"allocated_memory_mb\": allocated_memory,\n", + " \"reserved_memory_mb\": reserved_memory,\n", + " \"free_memory_mb\": free_memory\n", + " }\n", + " else:\n", + " return {\"device\": \"cpu\", \"error\": \"CUDA not available\"}\n", + "\n", + "class MemoryTracker:\n", + " 
\"\"\"Track memory usage across training phases\"\"\"\n", + " def __init__(self, collaborator_name, quant_type=\"4bit\"):\n", + " self.collaborator_name = collaborator_name\n", + " self.quant_type = quant_type # Track whether this is 4bit or 8bit\n", + " self.memory_log = {\n", + " \"model_load\": None,\n", + " \"after_trainer_init\": None,\n", + " \"before_training\": None,\n", + " \"after_training\": None,\n", + " \"peak_memory\": None\n", + " }\n", + " self.peak_memory = 0\n", + " self.reset_peak()\n", + " \n", + " def update_peak(self):\n", + " \"\"\"Update peak memory usage\"\"\"\n", + " if torch.cuda.is_available():\n", + " self.peak_memory = torch.cuda.max_memory_allocated() / 1024**2 # MB\n", + " self.memory_log[\"peak_memory\"] = self.peak_memory\n", + " \n", + " def log_memory(self, phase):\n", + " \"\"\"Log memory usage at a specific phase\"\"\"\n", + " if phase in self.memory_log:\n", + " self.memory_log[phase] = get_gpu_memory_info()[\"allocated_memory_mb\"] if torch.cuda.is_available() else 0\n", + " self.update_peak()\n", + " \n", + " def reset_peak(self):\n", + " \"\"\"Reset peak memory stats\"\"\"\n", + " if torch.cuda.is_available():\n", + " torch.cuda.reset_peak_memory_stats()\n", + " \n", + " def report(self):\n", + " \"\"\"Print memory usage report\"\"\"\n", + " print(f\"\\n==== Memory Usage Report for {self.collaborator_name} ====\")\n", + " for phase, memory in self.memory_log.items():\n", + " if memory is not None:\n", + " print(f\"{phase}: {memory:.2f} MB\")\n", + " else:\n", + " print(f\"{phase}: Not measured\")\n", + " print(f\"Quantization type: {self.quant_type}\")\n", + " print(\"=\"*50)\n", + " \n", + " def get_stats(self):\n", + " \"\"\"Get all stats in a dictionary format for aggregation\"\"\"\n", + " stats = {k: v for k, v in self.memory_log.items()}\n", + " stats[\"training_loss\"] = self.training_loss\n", + " stats[\"eval_loss\"] = self.eval_loss\n", + " stats[\"quant_type\"] = self.quant_type\n", + " return stats" + ] + }, + { + 
"cell_type": "markdown", + "id": "78f47784", + "metadata": {}, + "source": [ + "## Memory and Loss Monitoring Utilities" + ] + }, + { + "cell_type": "markdown", + "id": "805feb3c", + "metadata": {}, + "source": [ + "## Visualization Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cf20ad92", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_loss_metrics(flow_4bit, flow_8bit):\n", + " \"\"\"Plot and compare loss metrics between 4-bit and 8-bit quantization.\"\"\"\n", + " try:\n", + " import matplotlib.pyplot as plt\n", + " import pandas as pd\n", + " \n", + " # Create figure with two subplots\n", + " fig, axs = plt.subplots(1, 2, figsize=(16, 6))\n", + " fig.suptitle('Training and Evaluation Loss: 4-bit vs 8-bit Quantization', fontsize=16)\n", + " \n", + " # Prepare data\n", + " rounds = list(range(1, len(flow_4bit.average_loss_history) + 1))\n", + " \n", + " # Plot training loss\n", + " axs[0].set_title('Training Loss by Round')\n", + " axs[0].plot(rounds, flow_4bit.average_loss_history, 'o-', label='4-bit', color='blue')\n", + " axs[0].plot(rounds, flow_8bit.average_loss_history, 's-', label='8-bit', color='red')\n", + " axs[0].set_xlabel('Round')\n", + " axs[0].set_ylabel('Average Training Loss')\n", + " axs[0].legend()\n", + " axs[0].grid(True, alpha=0.3)\n", + " \n", + " # Plot evaluation loss\n", + " axs[1].set_title('Evaluation Loss by Round')\n", + " axs[1].plot(rounds, flow_4bit.local_model_loss_history, 'o-', label='4-bit (Local)', color='blue')\n", + " axs[1].plot(rounds, flow_8bit.local_model_loss_history, 's-', label='8-bit (Local)', color='red')\n", + " axs[1].plot(rounds, flow_4bit.agg_model_loss_history, 'o--', label='4-bit (Agg)', color='lightblue')\n", + " axs[1].plot(rounds, flow_8bit.agg_model_loss_history, 's--', label='8-bit (Agg)', color='salmon')\n", + " axs[1].set_xlabel('Round')\n", + " axs[1].set_ylabel('Evaluation Loss')\n", + " axs[1].legend()\n", + " axs[1].grid(True, alpha=0.3)\n", + " \n", + 
" plt.tight_layout()\n", + " plt.subplots_adjust(top=0.88)\n", + " plt.show()\n", + " \n", + " # Print textual summary\n", + " print(\"\\nLoss Metrics Summary:\")\n", + " print(f\"Final Training Loss: 4-bit = {flow_4bit.average_loss_history[-1]:.4f}, 8-bit = {flow_8bit.average_loss_history[-1]:.4f}\")\n", + " print(f\"Training Loss Difference: {abs(flow_4bit.average_loss_history[-1] - flow_8bit.average_loss_history[-1]):.4f}\")\n", + " \n", + " print(f\"\\nFinal Local Eval Loss: 4-bit = {flow_4bit.local_model_loss_history[-1]:.4f}, 8-bit = {flow_8bit.local_model_loss_history[-1]:.4f}\")\n", + " print(f\"Local Eval Loss Difference: {abs(flow_4bit.local_model_loss_history[-1] - flow_8bit.local_model_loss_history[-1]):.4f}\")\n", + " \n", + " print(f\"\\nFinal Aggregated Eval Loss: 4-bit = {flow_4bit.agg_model_loss_history[-1]:.4f}, 8-bit = {flow_8bit.agg_model_loss_history[-1]:.4f}\")\n", + " print(f\"Aggregated Eval Loss Difference: {abs(flow_4bit.agg_model_loss_history[-1] - flow_8bit.agg_model_loss_history[-1]):.4f}\")\n", + " \n", + " better_training = \"4-bit\" if flow_4bit.average_loss_history[-1] < flow_8bit.average_loss_history[-1] else \"8-bit\"\n", + " better_local = \"4-bit\" if flow_4bit.local_model_loss_history[-1] < flow_8bit.local_model_loss_history[-1] else \"8-bit\"\n", + " better_agg = \"4-bit\" if flow_4bit.agg_model_loss_history[-1] < flow_8bit.agg_model_loss_history[-1] else \"8-bit\"\n", + " \n", + " print(f\"\\nBest Training Performance: {better_training}\")\n", + " print(f\"Best Local Evaluation Performance: {better_local}\")\n", + " print(f\"Best Aggregated Evaluation Performance: {better_agg}\")\n", + " \n", + " except ImportError:\n", + " print(\"Plotting requires matplotlib and pandas. 
Install with: pip install matplotlib pandas\")\n", + " except Exception as e:\n", + " print(f\"Error plotting metrics: {str(e)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2668f39a-537e-4b4e-abfa-6b297e3aaa36", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Visualization libraries installed and imported successfully.\n" + ] + } + ], + "source": [ + "!pip install seaborn matplotlib pandas -q\n", + "\n", + "# Import the libraries\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "print(\"Visualization libraries installed and imported successfully.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "35cb4b3a", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualization functions for memory usage and training loss\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "def plot_memory_metrics(flow_4bit, flow_8bit=None):\n", + " \"\"\"Plot memory usage metrics comparing 4-bit and 8-bit quantization\"\"\"\n", + " # Extract and organize memory data for 4-bit\n", + " memory_data = []\n", + " \n", + " for collab, rounds_data in flow_4bit.all_memory_stats.items():\n", + " for round_name, stats in rounds_data.items():\n", + " # Extract round number from round_name (e.g., 'round_0' -> 0)\n", + " round_num = int(round_name.split('_')[1]) if '_' in round_name else 0\n", + " quant_type = stats.get(\"quant_type\", \"4bit\")\n", + " \n", + " # Extract memory data\n", + " for phase, memory in stats.items():\n", + " if memory is not None and phase not in ['training_loss', 'eval_loss', 'quant_type']:\n", + " memory_data.append({\n", + " \"Collaborator\": collab,\n", + " \"Round\": round_name,\n", + " \"Round Number\": round_num,\n", + " \"Phase\": phase,\n", + " \"Memory (MB)\": memory,\n", + " \"Quantization\": quant_type\n", + " })\n", + " \n", + " # Add 8-bit 
data if provided\n", + " if flow_8bit is not None:\n", + " for collab, rounds_data in flow_8bit.all_memory_stats.items():\n", + " for round_name, stats in rounds_data.items():\n", + " # Extract round number from round_name (e.g., 'round_0' -> 0)\n", + " round_num = int(round_name.split('_')[1]) if '_' in round_name else 0\n", + " quant_type = stats.get(\"quant_type\", \"8bit\")\n", + " \n", + " # Extract memory data\n", + " for phase, memory in stats.items():\n", + " if memory is not None and phase not in ['training_loss', 'eval_loss', 'quant_type']:\n", + " memory_data.append({\n", + " \"Collaborator\": collab,\n", + " \"Round\": round_name,\n", + " \"Round Number\": round_num,\n", + " \"Phase\": phase,\n", + " \"Memory (MB)\": memory,\n", + " \"Quantization\": quant_type\n", + " })\n", + " \n", + " if not memory_data:\n", + " print(\"No memory data collected\")\n", + " return\n", + " \n", + " memory_df = pd.DataFrame(memory_data)\n", + " \n", + " # Create a figure with subplots for memory metrics\n", + " fig, axes = plt.subplots(2, 1, figsize=(15, 14), gridspec_kw={'height_ratios': [1, 0.7]})\n", + " \n", + " # 1. Memory usage by phase for each quantization (top plot)\n", + " if flow_8bit is not None:\n", + " sns.barplot(x=\"Phase\", y=\"Memory (MB)\", hue=\"Quantization\", data=memory_df, ax=axes[0])\n", + " axes[0].set_title(\"Memory Usage by Phase and Quantization Type\", fontsize=14, fontweight='bold')\n", + " else:\n", + " sns.barplot(x=\"Phase\", y=\"Memory (MB)\", hue=\"Collaborator\", data=memory_df, ax=axes[0])\n", + " axes[0].set_title(\"Memory Usage by Phase and Collaborator\", fontsize=14, fontweight='bold')\n", + " \n", + " axes[0].set_xlabel(\"Phase\", fontsize=12)\n", + " axes[0].set_ylabel(\"Memory (MB)\", fontsize=12)\n", + " axes[0].tick_params(axis='x', rotation=45)\n", + " axes[0].legend(title=\"Quantization\" if flow_8bit else \"Collaborator\", bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " \n", + " # 2. 
Peak memory across rounds (bottom plot)\n", + " peak_data = memory_df[memory_df[\"Phase\"] == \"peak\"]\n", + " if not peak_data.empty:\n", + " group_var = \"Quantization\" if flow_8bit else \"Collaborator\"\n", + " sns.lineplot(\n", + " x=\"Round Number\", \n", + " y=\"Memory (MB)\", \n", + " hue=group_var, \n", + " data=peak_data, \n", + " marker='o', \n", + " sort=True,\n", + " linewidth=3,\n", + " markersize=10,\n", + " ax=axes[1]\n", + " )\n", + " axes[1].set_title(\"Peak Memory Usage Across Rounds\", fontsize=14, fontweight='bold')\n", + " axes[1].set_xlabel(\"Round\", fontsize=12)\n", + " axes[1].set_ylabel(\"Memory (MB)\", fontsize=12)\n", + " axes[1].legend(title=group_var, bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " axes[1].grid(True, linestyle='--', alpha=0.7)\n", + " \n", + " plt.tight_layout()\n", + " plt.savefig('memory_metrics_comparison.png', dpi=300, bbox_inches='tight')\n", + " plt.show()\n", + "\n", + "def plot_loss_metrics(flow_4bit, flow_8bit=None):\n", + " \"\"\"Plot training and evaluation loss metrics comparing 4-bit and 8-bit quantization\"\"\"\n", + " # Extract and organize loss data for 4-bit\n", + " loss_data = []\n", + " \n", + " for collab, rounds_data in flow_4bit.all_memory_stats.items():\n", + " for round_name, stats in rounds_data.items():\n", + " # Extract round number from round_name (e.g., 'round_0' -> 0)\n", + " round_num = int(round_name.split('_')[1]) if '_' in round_name else 0\n", + " quant_type = stats.get(\"quant_type\", \"4bit\")\n", + " \n", + " # Extract loss data\n", + " if 'training_loss' in stats and stats['training_loss'] is not None:\n", + " loss_data.append({\n", + " \"Collaborator\": collab,\n", + " \"Round\": round_name,\n", + " \"Round Number\": round_num,\n", + " \"Metric\": \"Training Loss\",\n", + " \"Value\": stats['training_loss'],\n", + " \"Quantization\": quant_type\n", + " })\n", + " if 'eval_loss' in stats and stats['eval_loss'] is not None:\n", + " loss_data.append({\n", + " \"Collaborator\": 
collab,\n", + " \"Round\": round_name,\n", + " \"Round Number\": round_num,\n", + " \"Metric\": \"Evaluation Loss\",\n", + " \"Value\": stats['eval_loss'],\n", + " \"Quantization\": quant_type\n", + " })\n", + " \n", + " # Add 8-bit data if provided\n", + " if flow_8bit is not None:\n", + " for collab, rounds_data in flow_8bit.all_memory_stats.items():\n", + " for round_name, stats in rounds_data.items():\n", + " # Extract round number from round_name (e.g., 'round_0' -> 0)\n", + " round_num = int(round_name.split('_')[1]) if '_' in round_name else 0\n", + " quant_type = stats.get(\"quant_type\", \"8bit\")\n", + " \n", + " # Extract loss data\n", + " if 'training_loss' in stats and stats['training_loss'] is not None:\n", + " loss_data.append({\n", + " \"Collaborator\": collab,\n", + " \"Round\": round_name,\n", + " \"Round Number\": round_num,\n", + " \"Metric\": \"Training Loss\",\n", + " \"Value\": stats['training_loss'],\n", + " \"Quantization\": quant_type\n", + " })\n", + " if 'eval_loss' in stats and stats['eval_loss'] is not None:\n", + " loss_data.append({\n", + " \"Collaborator\": collab,\n", + " \"Round\": round_name,\n", + " \"Round Number\": round_num,\n", + " \"Metric\": \"Evaluation Loss\",\n", + " \"Value\": stats['eval_loss'],\n", + " \"Quantization\": quant_type\n", + " })\n", + " \n", + " if not loss_data:\n", + " print(\"No loss data collected\")\n", + " return\n", + " \n", + " loss_df = pd.DataFrame(loss_data)\n", + " \n", + " # Create a figure with subplots for loss metrics\n", + " if flow_8bit is None:\n", + " fig, axes = plt.subplots(2, 1, figsize=(15, 12), gridspec_kw={'height_ratios': [1, 0.8]})\n", + " \n", + " # 1. 
Training and eval loss per round (top plot)\n", + " sns.lineplot(\n", + " x=\"Round Number\", \n", + " y=\"Value\", \n", + " hue=\"Collaborator\", \n", + " style=\"Metric\", \n", + " data=loss_df, \n", + " marker='o', \n", + " sort=True,\n", + " linewidth=3,\n", + " markersize=10,\n", + " ax=axes[0]\n", + " )\n", + " axes[0].set_title(\"Training and Evaluation Loss by Round\", fontsize=14, fontweight='bold')\n", + " axes[0].set_xlabel(\"Round\", fontsize=12)\n", + " axes[0].set_ylabel(\"Loss\", fontsize=12)\n", + " axes[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " axes[0].grid(True, linestyle='--', alpha=0.7)\n", + " \n", + " # 2. Boxplot of loss distribution by round (bottom plot)\n", + " sns.boxplot(x=\"Round\", y=\"Value\", hue=\"Metric\", data=loss_df, ax=axes[1])\n", + " axes[1].set_title(\"Loss Distribution Across Rounds\", fontsize=14, fontweight='bold')\n", + " axes[1].set_xlabel(\"Round\", fontsize=12)\n", + " axes[1].set_ylabel(\"Loss Value\", fontsize=12)\n", + " axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " else:\n", + " # Comparison between 4-bit and 8-bit\n", + " fig, axes = plt.subplots(2, 1, figsize=(15, 12), gridspec_kw={'height_ratios': [1, 1]})\n", + " \n", + " # 1. Training loss comparison\n", + " training_loss_df = loss_df[loss_df[\"Metric\"] == \"Training Loss\"]\n", + " sns.lineplot(\n", + " x=\"Round Number\", \n", + " y=\"Value\", \n", + " hue=\"Quantization\", \n", + " style=\"Collaborator\", \n", + " data=training_loss_df, \n", + " marker='o', \n", + " sort=True,\n", + " linewidth=3,\n", + " markersize=10,\n", + " ax=axes[0]\n", + " )\n", + " axes[0].set_title(\"Training Loss Comparison: 4-bit vs 8-bit\", fontsize=14, fontweight='bold')\n", + " axes[0].set_xlabel(\"Round\", fontsize=12)\n", + " axes[0].set_ylabel(\"Training Loss\", fontsize=12)\n", + " axes[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " axes[0].grid(True, linestyle='--', alpha=0.7)\n", + " \n", + " # 2. 
Evaluation loss comparison\n", + " eval_loss_df = loss_df[loss_df[\"Metric\"] == \"Evaluation Loss\"]\n", + " sns.lineplot(\n", + " x=\"Round Number\", \n", + " y=\"Value\", \n", + " hue=\"Quantization\", \n", + " style=\"Collaborator\", \n", + " data=eval_loss_df, \n", + " marker='o', \n", + " sort=True,\n", + " linewidth=3,\n", + " markersize=10,\n", + " ax=axes[1]\n", + " )\n", + " axes[1].set_title(\"Evaluation Loss Comparison: 4-bit vs 8-bit\", fontsize=14, fontweight='bold')\n", + " axes[1].set_xlabel(\"Round\", fontsize=12)\n", + " axes[1].set_ylabel(\"Evaluation Loss\", fontsize=12)\n", + " axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " axes[1].grid(True, linestyle='--', alpha=0.7)\n", + " \n", + " plt.tight_layout()\n", + " plt.savefig('loss_metrics_comparison.png', dpi=300, bbox_inches='tight')\n", + " plt.show()\n", + "\n", + "def plot_aggregated_metrics(flow_4bit, flow_8bit):\n", + " \"\"\"Plot aggregated metrics comparing 4-bit and 8-bit quantization\"\"\"\n", + " if (not hasattr(flow_4bit, 'average_loss_history') or not flow_4bit.average_loss_history or\n", + " not hasattr(flow_8bit, 'average_loss_history') or not flow_8bit.average_loss_history):\n", + " print(\"Not enough aggregated metrics data available\")\n", + " return\n", + " \n", + " # Create comparison dataframes\n", + " rounds_4bit = list(range(len(flow_4bit.average_loss_history)))\n", + " data_4bit = pd.DataFrame({\n", + " 'Round': rounds_4bit,\n", + " 'Average Training Loss': flow_4bit.average_loss_history,\n", + " 'Aggregated Model Loss': flow_4bit.agg_model_loss_history,\n", + " 'Local Model Loss': flow_4bit.local_model_loss_history,\n", + " 'Quantization': '4-bit'\n", + " })\n", + " \n", + " rounds_8bit = list(range(len(flow_8bit.average_loss_history)))\n", + " data_8bit = pd.DataFrame({\n", + " 'Round': rounds_8bit,\n", + " 'Average Training Loss': flow_8bit.average_loss_history,\n", + " 'Aggregated Model Loss': flow_8bit.agg_model_loss_history,\n", + " 'Local Model 
Loss': flow_8bit.local_model_loss_history,\n", + " 'Quantization': '8-bit'\n", + " })\n", + " \n", + " # Combine data\n", + " combined_data = pd.concat([data_4bit, data_8bit])\n", + " \n", + " # Melt for easier plotting\n", + " melted_data = pd.melt(\n", + " combined_data,\n", + " id_vars=['Round', 'Quantization'],\n", + " value_vars=['Average Training Loss', 'Aggregated Model Loss', 'Local Model Loss'],\n", + " var_name='Metric',\n", + " value_name='Loss'\n", + " )\n", + " \n", + " # Plot comparison\n", + " plt.figure(figsize=(15, 8))\n", + " sns.lineplot(\n", + " data=melted_data,\n", + " x='Round',\n", + " y='Loss',\n", + " hue='Quantization',\n", + " style='Metric',\n", + " markers=True,\n", + " dashes=True,\n", + " linewidth=3\n", + " )\n", + " \n", + " plt.title('Comparison of 4-bit vs 8-bit Quantization Performance', fontsize=16, fontweight='bold')\n", + " plt.xlabel('Round', fontsize=14)\n", + " plt.ylabel('Loss', fontsize=14)\n", + " plt.grid(True, linestyle='--', alpha=0.7)\n", + " plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " plt.tight_layout()\n", + " plt.savefig('aggregated_comparison.png', dpi=300, bbox_inches='tight')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "61c7da64", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== Training & Evaluation Loss Comparison: 4-bit vs 8-bit ====\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot training loss metrics comparison between 4-bit and 8-bit\n", + "print(\"\\n==== Training & Evaluation Loss Comparison: 4-bit vs 8-bit ====\\n\")\n", + "plot_loss_metrics(flflow_4bit, flflow_8bit)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "397cb9c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== Aggregated Performance Metrics: 4-bit vs 8-bit ====\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot aggregated metrics comparison\n", + "print(\"\\n==== Aggregated Performance Metrics: 4-bit vs 8-bit ====\\n\")\n", + "plot_aggregated_metrics(flflow_4bit, flflow_8bit)" + ] + }, + { + "cell_type": "markdown", + "id": "4c089e09", + "metadata": {}, + "source": [ + "### Memory Efficiency Comparison\n", + "\n", + "- **4-bit Quantization**: Uses less memory overall, allowing for larger batch sizes or model sizes on the same hardware.\n", + "- **8-bit Quantization**: Requires more memory but still offers significant savings compared to full precision (FP16/FP32).\n", + "- **Peak Memory Usage**: The difference in peak memory consumption shows the trade-off between precision and memory requirements.\n", + "\n", + "### Training Performance Comparison\n", + "\n", + "- **Training Loss**: 8-bit quantization typically maintains closer fidelity to the original model, potentially leading to slightly better training convergence.\n", + "- **Evaluation Loss**: The evaluation metrics help determine if the higher precision of 8-bit quantization translates to better model performance.\n", + "\n", + "### Use Case Recommendations\n", + "\n", + "- **Resource-constrained environments**: 4-bit quantization provides better memory efficiency for edge devices or limited GPU resources.\n", + "- **Higher precision needs**: If model accuracy is critical and resources permit, 8-bit quantization offers a good balance between performance and efficiency.\n", + "- **Federated Learning Impact**: The quantization choice particularly affects resource utilization across collaborators in the federated setting." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (myenv)", + "language": "python", + "name": "myenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f2a2eedf04a3e8be624e35cb7de2319b58214e4d Mon Sep 17 00:00:00 2001 From: Rajith Date: Fri, 16 May 2025 15:45:01 +0530 Subject: [PATCH 30/34] removed older code --- .../LLM/phi-4-with4bit quantization.ipynb | 1772 ----------------- 1 file changed, 1772 deletions(-) delete mode 100644 openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb deleted file mode 100644 index e2efa9054b..0000000000 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-with4bit quantization.ipynb +++ /dev/null @@ -1,1772 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a59f475d-d843-46bc-b75e-10984b687ed3", - "metadata": {}, - "source": [ - "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization" - ] - }, - { - "cell_type": "markdown", - "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", - "metadata": {}, - "source": [ - "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n", - "- Parameter-Efficient Fine-Tuning (PEFT)\n", - "- 4-bit Quantization (QLoRA)\n", - "- Gradient Checkpointing\n", - "- Optimized Training Configuration" - ] - }, - { - "cell_type": "markdown", - "id": "d07c32d3-1a8d-4162-af45-bc3a10e0ae3f", - "metadata": {}, - "source": [ - "## Installation" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": 
"05b2ad75-8c7b-499c-902e-dbd5b24361bc", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a7ae1a7e-8c16-4c5a-be57-33d84723aed7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Thu May 15 13:27:27 2025 \n", - "+-----------------------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |\n", - "|-----------------------------------------+------------------------+----------------------+\n", - "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. |\n", - "|=========================================+========================+======================|\n", - "| 0 NVIDIA H100 NVL Off | 00000001:00:00.0 Off | 0 |\n", - "| N/A 39C P0 62W / 400W | 1MiB / 95830MiB | 0% Default |\n", - "| | | Disabled |\n", - "+-----------------------------------------+------------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=========================================================================================|\n", - "| No running processes found |\n", - "+-----------------------------------------------------------------------------------------+\n" - ] - } - ], - "source": [ - "!nvidia-smi" - ] - }, - { - "cell_type": "markdown", - "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", - "metadata": {}, - "source": [ - "## Import Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", - "metadata": {}, - "outputs": [ - { - "name": 
"stderr", - "output_type": "stream", - "text": [ - "/home/azureuser/env_name/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "2025-05-15 13:27:30,648\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" - ] - } - ], - "source": [ - "# System imports\n", - "import os\n", - "import numpy as np\n", - "\n", - "# PyTorch imports\n", - "import torch\n", - "\n", - "# Hugging Face Transformers imports for model loading and training\n", - "from transformers import (\n", - " AutoModelForCausalLM, # For loading large language models\n", - " AutoTokenizer, # For tokenizing text inputs\n", - " BitsAndBytesConfig, # For 4-bit quantization configuration\n", - " TrainingArguments # For configuring training hyperparameters\n", - ")\n", - "\n", - "# PEFT (Parameter-Efficient Fine-Tuning) imports\n", - "from peft import (\n", - " LoraConfig, # For configuring Low-Rank Adaptation\n", - " get_peft_model, # For applying PEFT to a model\n", - " prepare_model_for_kbit_training, # For preparing quantized models for training\n", - " PeftModel # Base class for PEFT models\n", - ")\n", - "from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict # For state dict manipulation\n", - "\n", - "# Dataset and training imports\n", - "from datasets import load_dataset\n", - "from trl import SFTTrainer # Supervised Fine-Tuning Trainer\n", - "\n", - "# OpenFL imports for federated learning\n", - "from openfl.experimental.workflow.interface import Aggregator, Collaborator, FLSpec\n", - "from openfl.experimental.workflow.placement import aggregator, collaborator\n", - "from openfl.experimental.workflow.runtime import LocalRuntime" - ] - }, - { - "cell_type": "markdown", - "id": "06274755", - 
"metadata": {}, - "source": [ - "## Acquiring and preprocessing dataset" - ] - }, - { - "cell_type": "markdown", - "id": "a6edefa4", - "metadata": {}, - "source": [ - "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "962ac825", - "metadata": {}, - "outputs": [], - "source": [ - "# Import libraries needed for downloading and verifying the dataset\n", - "import hashlib\n", - "import requests\n", - "\n", - "def file_checksum(file_path, algorithm=\"sha256\"):\n", - " \"\"\"\n", - " Calculate the checksum of a file using the specified hashing algorithm.\n", - " \n", - " Args:\n", - " file_path (str): The path to the file for which the checksum is to be calculated.\n", - " algorithm (str): The hashing algorithm to use (default is 'sha256').\n", - " \n", - " Returns:\n", - " str: The calculated checksum of the file.\n", - " \"\"\"\n", - " hash_func = hashlib.new(algorithm)\n", - " with open(file_path, \"rb\") as f:\n", - " for chunk in iter(lambda: f.read(4096), b\"\"):\n", - " hash_func.update(chunk)\n", - " return hash_func.hexdigest()\n", - "\n", - "\n", - "# Download the dataset if it doesn't exist locally\n", - "if not os.path.exists(\"math_10k.json\"):\n", - " print(\"Downloading math_10k.json dataset...\")\n", - " r = requests.get(\n", - " \"https://raw.githubusercontent.com/AGI-Edgerunners/LLM-Adapters/main/ft-training_set/math_10k.json\",\n", - " )\n", - " with open(\n", - " \"math_10k.json\",\n", - " \"wb\",\n", - " ) as f:\n", - " f.write(r.content)\n", - " print(\"Download complete.\")\n", - "\n", - " # Verify the integrity of the downloaded file\n", - " actual_checksum = file_checksum(\"math_10k.json\")\n", - " expected_checksum = \"0342d0d860ad8592b579329337c90e42eefd3d9f2898043140cbd120630418b8\"\n", - " if actual_checksum != expected_checksum:\n", - " raise ValueError(\n", - " \"Checksum verification failed. 
The file may have been altered.\"\n", - " )\n", - " print(\"Checksum verification successful.\")\n", - "else:\n", - " print(\"Dataset already exists locally.\")\n", - "\n", - "# Set the dataset path to be used later\n", - "dataset_name = \"math_10k.json\"" - ] - }, - { - "cell_type": "markdown", - "id": "08576aa0-f628-4ae6-8fc3-dd167d164784", - "metadata": {}, - "source": [ - "## Configuration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eada9809-468a-47c6-9b03-55aa887c9487", - "metadata": {}, - "outputs": [], - "source": [ - "# Model and dataset configuration\n", - "model_name = \"microsoft/phi-4\" # Pre-trained model identifier from Hugging Face Hub\n", - "#dataset_name = \"math_10k.json\" # Dataset file containing mathematical QA pairs\n", - "\n", - "# QLoRA (Quantized Low-Rank Adaptation) configuration for 4-bit quantization\n", - "# This reduces memory footprint while maintaining model quality\n", - "bnb_config = BitsAndBytesConfig(\n", - " load_in_4bit=True, # Enable 4-bit quantization\n", - " bnb_4bit_quant_type=\"nf4\", # Use normalized float 4 format for better precision\n", - " bnb_4bit_compute_dtype=torch.bfloat16, # Computation precision\n", - " bnb_4bit_use_double_quant=False, # Disable nested quantization for simplicity\n", - ")\n", - "\n", - "# LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning\n", - "# This allows fine-tuning with significantly fewer parameters\n", - "peft_config = LoraConfig(\n", - " r=8, # Rank of the update matrices (higher = more capacity but more parameters)\n", - " lora_alpha=16, # Scaling factor for the trained weights\n", - " lora_dropout=0.01, # Dropout probability for LoRA layers\n", - " bias=\"none\", # Don't train bias parameters to reduce memory\n", - " task_type=\"CAUSAL_LM\", # Specify causal language modeling task\n", - " target_modules=\"all-linear\", # Apply LoRA to all linear layers\n", - ")\n", - "\n", - "# Training hyperparameters configuration\n", - 
"training_args = TrainingArguments(\n", - " output_dir=\"./results\", # Directory to save checkpoints and logs\n", - " num_train_epochs=1, # Number of training epochs\n", - " per_device_train_batch_size=2, # Batch size per GPU/TPU core\n", - " gradient_accumulation_steps=2, # Number of updates steps to accumulate before backward pass\n", - " optim=\"adamw_torch_fused\", # Optimizer to use (fused for better performance)\n", - " save_steps=100, # Save checkpoint every X updates steps\n", - " logging_steps=10, # Log metrics every X updates steps\n", - " learning_rate=3e-4, # Initial learning rate\n", - " weight_decay=0.001, # Weight decay regularization\n", - " fp16=False, # Disable FP16 training (using BF16 instead)\n", - " bf16=True, # Enable BF16 training (better numerical stability than FP16)\n", - " max_grad_norm=0.5, # Max gradient norm for gradient clipping\n", - " warmup_ratio=0.02, # Portion of steps for learning rate warmup\n", - " lr_scheduler_type=\"cosine\", # Learning rate scheduler type\n", - " gradient_checkpointing=True, # Enable gradient checkpointing to save memory\n", - " report_to=\"none\" # Disable reporting to tracking platforms\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ffe93234-2a1a-4809-a431-efe2f35ce496", - "metadata": {}, - "source": [ - "## Load and Prepare Model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.36it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "trainable params: 27,852,800 || all params: 14,687,360,000 || trainable%: 0.1896\n" - ] - } - ], - "source": [ - "# Load tokenizer\n", - "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", - 
"tokenizer.pad_token = tokenizer.eos_token\n", - "tokenizer.padding_side = \"right\"\n", - "\n", - "# Load model with quantization\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " model_name,\n", - " quantization_config=bnb_config,\n", - " device_map=\"auto\",\n", - " trust_remote_code=True\n", - ")\n", - "\n", - "# Prepare model for k-bit training\n", - "model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)\n", - "\n", - "# Apply LoRA\n", - "model = get_peft_model(model, peft_config)\n", - "model.print_trainable_parameters()" - ] - }, - { - "cell_type": "markdown", - "id": "dd058fff-f6dd-4cc6-acaf-7e2fa2c1132d", - "metadata": {}, - "source": [ - "## Load and Prepare Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", - "metadata": {}, - "outputs": [], - "source": [ - "def format_prompt(example):\n", - " \"\"\"\n", - " Format a dataset example into a standardized prompt-response format for instruction tuning.\n", - " \n", - " This function converts raw dataset examples into a structured format suitable for\n", - " instruction fine-tuning of large language models. The format follows the common\n", - " pattern used for instruction-following tasks with clear section demarcation.\n", - " \n", - " Args:\n", - " example (dict): A dictionary containing the example data with keys:\n", - " - 'instruction': The task instruction\n", - " - 'input': The optional input context (may be empty)\n", - " - 'output': The expected output/response\n", - " \n", - " Returns:\n", - " str: A formatted prompt string with instruction, optional input, and response\n", - " \"\"\"\n", - " if example[\"input\"]:\n", - " return f\"\"\"Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n", - "\n", - "### Instruction:\n", - "{example['instruction']}\n", - "\n", - "### Input:\n", - "{example['input']}\n", - "\n", - "### Response:\n", - "{example['output']}\"\"\"\n", - " else:\n", - " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", - "\n", - "### Instruction:\n", - "{example['instruction']}\n", - "\n", - "### Response:\n", - "{example['output']}\"\"\"\n", - "\n", - "# Load dataset from JSON file (contains mathematical question-answer pairs)\n", - "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\", num_proc=4)\n", - "\n", - "# Transform raw examples into formatted text for instruction tuning\n", - "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)}, num_proc=4)\n", - "\n", - "# Split dataset into training (90%) and evaluation (10%) sets\n", - "dataset = dataset.train_test_split(test_size=0.1)\n", - "train_dataset = dataset[\"train\"]\n", - "eval_dataset = dataset[\"test\"]" - ] - }, - { - "cell_type": "markdown", - "id": "812cfcc8-33ec-4a2b-8a74-27bfc2a41d7b", - "metadata": {}, - "source": [ - "## Enhanced Training with SFTTrainer" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. 
Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "Generating train split: 1820 examples [00:02, 613.71 examples/s]\n", - "Generating train split: 209 examples [00:00, 582.95 examples/s]\n" - ] - } - ], - "source": [ - "trainer = SFTTrainer(\n", - " model=model,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " peft_config=peft_config,\n", - " dataset_text_field=\"text\",\n", - " max_seq_length=1024,\n", - " tokenizer=tokenizer,\n", - " args=training_args,\n", - " packing=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "810eb75e", - "metadata": {}, - "source": [ - "## Federated Averaging Function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58298e8e-ab9e-4377-966e-143823441697", - "metadata": {}, - "outputs": [], - "source": [ - "def FedAvg(peft_params, model, weights=None):\n", - " \"\"\"\n", - " Perform Federated Averaging (FedAvg) on the model parameters.\n", - " \n", - " This function aggregates PEFT parameters from multiple collaborators using weighted\n", - " averaging. 
It handles the complex task of averaging parameters while maintaining \n", - " the correct tensor types and shapes required by the PEFT framework.\n", - " \n", - " Args:\n", - " peft_params (list): A list of state dictionaries containing PEFT parameters from different collaborators.\n", - " model (torch.nn.Module): The base model to which the averaged parameters will be applied.\n", - " weights (list, optional): A list of weights for averaging the parameters. If None, equal weights are used.\n", - " Weights determine the contribution of each collaborator to the final model.\n", - " \n", - " Returns:\n", - " torch.nn.Module: The model with the averaged parameters applied.\n", - " \n", - " Notes:\n", - " The function converts tensors to float for averaging to avoid precision issues,\n", - " then converts back to the original data type for model compatibility.\n", - " \"\"\"\n", - " # Store the state dictionaries for easy access\n", - " state_dicts = peft_params\n", - " # Get the current state dict from the model as a template\n", - " state_dict = get_peft_model_state_dict(model)\n", - " \n", - " # Iterate through each parameter in the first state dict as reference\n", - " for key in peft_params[0]:\n", - " # Store original data type for later conversion\n", - " dtype = state_dicts[0][key].dtype\n", - " \n", - " # Convert all tensors to float, move to CPU, perform weighted average\n", - " state_dict[key] = torch.from_numpy(\n", - " np.average(\n", - " [state[key].cpu().to(torch.float).numpy() for state in state_dicts], \n", - " axis=0, \n", - " weights=weights\n", - " )\n", - " ).to(dtype) # Convert back to original data type\n", - " \n", - " # Apply the averaged parameters back to the model\n", - " set_peft_model_state_dict(model, state_dict)\n", - " return model" - ] - }, - { - "cell_type": "markdown", - "id": "e120a656-f4a5-47a5-a3d4-62c5f3672bba", - "metadata": {}, - "source": [ - "## Federated Learning Workflow" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Aggregator step \"start\" registered\n", - "Collaborator step \"aggregated_model_validation\" registered\n", - "Collaborator step \"train\" registered\n", - "Collaborator step \"local_model_validation\" registered\n", - "Aggregator step \"join\" registered\n", - "Aggregator step \"end\" registered\n" - ] - } - ], - "source": [ - "# Import the required PrinterCallback for proper initialization/removal\n", - "from transformers.trainer_callback import PrinterCallback\n", - "import transformers\n", - "\n", - "class FederatedFlow(FLSpec):\n", - " \"\"\"\n", - " Federated Learning workflow for fine-tuning Phi-4 model with PEFT and quantization.\n", - " \n", - " This class implements the complete federated learning workflow for a language model,\n", - " including initialization, aggregated model validation, training, local model validation,\n", - " and parameter aggregation. It uses Parameter-Efficient Fine-Tuning (PEFT) with 4-bit\n", - " quantization to efficiently train large language models in memory-constrained environments.\n", - " \n", - " The workflow follows these steps for each round:\n", - " 1. Initialize model on each collaborator\n", - " 2. Validate the aggregated model on local data\n", - " 3. Train the model locally on each collaborator\n", - " 4. Validate the locally trained model\n", - " 5. Aggregate PEFT parameters from all collaborators using FedAvg\n", - " 6. 
Repeat for specified number of rounds\n", - " \n", - " Attributes:\n", - " model: The base language model being fine-tuned\n", - " peft_params: PEFT parameters dictionary for the model\n", - " optimizer: Optimizer for training (optional)\n", - " rounds: Number of federated learning rounds to perform\n", - " current_round: Counter for the current round\n", - " collaborators: List of collaborators participating in federated learning\n", - " \"\"\"\n", - " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n", - " \"\"\"\n", - " Initialize the federated learning workflow.\n", - " \n", - " Args:\n", - " model: The base language model to fine-tune. Must be provided.\n", - " optimizer: Optional optimizer for model training.\n", - " rounds: Number of federated learning rounds to perform (default: 3).\n", - " **kwargs: Additional arguments passed to the parent class.\n", - " \n", - " Raises:\n", - " ValueError: If no model is provided.\n", - " \"\"\"\n", - " super().__init__(**kwargs)\n", - " if model is not None:\n", - " self.model = model\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " self.optimizer = optimizer\n", - " else:\n", - " raise ValueError(\"No model inputted\")\n", - "\n", - " self.rounds = rounds\n", - " \n", - "\n", - " @aggregator\n", - " def start(self):\n", - " \"\"\"\n", - " Start the federated learning process on the aggregator.\n", - " \n", - " This method initializes the workflow by:\n", - " 1. Setting up the list of collaborators from the runtime\n", - " 2. Initializing the current round counter\n", - " 3. 
Starting the first step of the workflow by sending the model\n", - " to all collaborators for validation\n", - " \n", - " The @aggregator decorator ensures this method runs on the aggregator node.\n", - " \"\"\"\n", - " print(f\"Performing initialization for model\")\n", - " self.collaborators = self.runtime.collaborators\n", - " self.current_round = 0\n", - " # Start the workflow by sending the model to all collaborators\n", - " self.next(\n", - " self.aggregated_model_validation,\n", - " foreach=\"collaborators\",\n", - " )\n", - "\n", - " \n", - " @collaborator\n", - " def aggregated_model_validation(self):\n", - " \"\"\"\n", - " Validate the aggregated model on each collaborator's local dataset.\n", - " \n", - " This method:\n", - " 1. Loads the model with appropriate quantization configuration\n", - " 2. Applies the PEFT configuration and parameters\n", - " 3. Creates a trainer with local validation dataset\n", - " 4. Evaluates the model and records the validation loss\n", - " 5. Transitions to the training phase\n", - " \n", - " The @collaborator decorator ensures this method runs on each collaborator node.\n", - " \n", - " Notes:\n", - " Includes fallback to CPU if GPU memory is insufficient\n", - " \"\"\"\n", - " print(f\"Performing aggregated model validation for collaborator {self.input}\")\n", - " # Load model with quantization and CPU offloading if needed\n", - " device_map = \"auto\" \n", - " try:\n", - " # Try to load model on GPU with quantization\n", - " self.model = AutoModelForCausalLM.from_pretrained(\n", - " model_name,\n", - " quantization_config=bnb_config,\n", - " device_map=device_map,\n", - " #max_memory={0: \"4GiB\", \"cpu\": \"24GiB\"},\n", - " trust_remote_code=True\n", - " )\n", - " except ValueError:\n", - " # Fallback to CPU if GPU memory is insufficient\n", - " print(f\"Falling back to CPU mode for {self.input}\")\n", - " self.model = AutoModelForCausalLM.from_pretrained(\n", - " model_name,\n", - " device_map=\"cpu\",\n", - " 
trust_remote_code=True\n", - " )\n", - " \n", - " # Prepare model for training with quantization\n", - " self.model = prepare_model_for_kbit_training(self.model)\n", - " # Apply PEFT configuration (LoRA)\n", - " self.model = get_peft_model(self.model, peft_config)\n", - " # Load aggregated parameters\n", - " set_peft_model_state_dict(self.model, self.peft_params)\n", - " \n", - " # Setup trainer for evaluation\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=training_args,\n", - " peft_config=peft_config,\n", - " train_dataset=self.train_dataset,\n", - " eval_dataset=self.eval_dataset,\n", - " max_seq_length=1024,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", - "\n", - " # Remove default printer callback to avoid verbose output\n", - " trainer.remove_callback(PrinterCallback)\n", - " # Evaluate model and store metrics\n", - " out = trainer.evaluate()\n", - " self.agg_validation_score = out[\"eval_loss\"]\n", - " print(f\"{self.input} value of {self.agg_validation_score}\")\n", - " # Move to training phase\n", - " self.next(self.train)\n", - "\n", - " @collaborator\n", - " def train(self):\n", - " \"\"\"\n", - " Train the model on each collaborator's local dataset.\n", - " \n", - " This method:\n", - " 1. Creates an SFTTrainer with the local training dataset\n", - " 2. Runs the training process\n", - " 3. Records the training loss\n", - " 4. Saves the trained model\n", - " 5. 
Transitions to local validation phase\n", - " \n", - " The @collaborator decorator ensures this method runs on each collaborator node.\n", - " \"\"\"\n", - " # Setup trainer for local training\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=training_args,\n", - " peft_config=peft_config,\n", - " train_dataset=self.train_dataset,\n", - " eval_dataset=self.eval_dataset,\n", - " max_seq_length=1024,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", - "\n", - " # Execute training\n", - " out = trainer.train()\n", - " # Store training loss for later analysis\n", - " self.loss = out.training_loss\n", - " # Save locally trained model\n", - " trainer.save_model()\n", - " self.training_completed = True\n", - " # Move to local validation phase\n", - " self.next(self.local_model_validation)\n", - "\n", - " @collaborator\n", - " def local_model_validation(self):\n", - " \"\"\"\n", - " Validate the locally trained model on each collaborator's validation dataset.\n", - " \n", - " This method:\n", - " 1. Creates an SFTTrainer with the local validation dataset\n", - " 2. Evaluates the locally trained model\n", - " 3. Records the validation loss\n", - " 4. Extracts the PEFT parameters for aggregation\n", - " 5. 
Sends results to the aggregator for parameter aggregation\n", - " \n", - " The @collaborator decorator ensures this method runs on each collaborator node.\n", - " \n", - " Notes:\n", - " Excludes the full model and training flags from the data sent to the aggregator\n", - " to reduce communication overhead\n", - " \"\"\"\n", - " # Setup trainer for evaluation\n", - " trainer = SFTTrainer(\n", - " model=self.model,\n", - " args=training_args,\n", - " peft_config=peft_config,\n", - " train_dataset=self.train_dataset,\n", - " eval_dataset=self.eval_dataset,\n", - " max_seq_length=1024,\n", - " dataset_text_field=\"text\",\n", - " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", - " )\n", - " # Evaluate the locally trained model\n", - " out = trainer.evaluate()\n", - " self.local_validation_score = out[\"eval_loss\"]\n", - " # Extract PEFT parameters for aggregation\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - " print(f\"Doing local model validation for collaborator {self.input}\")\n", - " # Send results to aggregator, excluding the full model and training flags\n", - " # to reduce communication overhead\n", - " self.next(self.join, exclude=[\"training_completed\", \"model\"])\n", - "\n", - " @aggregator\n", - " def join(self, inputs):\n", - " \"\"\"\n", - " Aggregate results from all collaborators and update the global model.\n", - " \n", - " This method:\n", - " 1. Calculates average loss, aggregated model accuracy, and local model accuracy\n", - " 2. Updates the global model using Federated Averaging (FedAvg)\n", - " 3. Saves the aggregated model and tokenizer\n", - " 4. 
Either starts the next round or ends the workflow depending on round count\n", - " \n", - " Args:\n", - " inputs: List of data objects from all collaborators containing validation scores\n", - " and PEFT parameters.\n", - " \n", - " The @aggregator decorator ensures this method runs on the aggregator node.\n", - " \"\"\"\n", - " # Calculate average metrics across all collaborators\n", - " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", - " self.aggregated_model_accuracy = sum(\n", - " input.agg_validation_score for input in inputs\n", - " ) / len(inputs)\n", - " self.local_model_accuracy = sum(\n", - " input.local_validation_score for input in inputs\n", - " ) / len(inputs)\n", - " \n", - " # Display aggregated metrics\n", - " print(\n", - " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n", - " )\n", - " print(f\"Average training loss = {self.average_loss}\")\n", - " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n", - "\n", - " # Perform federated averaging of model parameters\n", - " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", - " self.peft_params = get_peft_model_state_dict(self.model)\n", - "\n", - " # Save the aggregated model for future use\n", - " self.model.save_pretrained(\"./aggregated/model\")\n", - " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n", - " \n", - " # Increment round counter and start next round or end workflow\n", - " self.current_round += 1\n", - " if self.current_round < self.rounds:\n", - " self.next(\n", - " self.aggregated_model_validation,\n", - " foreach=\"collaborators\",\n", - " exclude=[\"model\"],\n", - " )\n", - " else:\n", - " self.next(self.end)\n", - "\n", - " @aggregator\n", - " def end(self):\n", - " \"\"\"\n", - " End the federated learning process.\n", - " \n", - " This method marks the end of the federated learning workflow after all rounds\n", - " have been completed. 
The final aggregated model and tokenizer are already saved\n", - " in the last join step.\n", - " \n", - " The @aggregator decorator ensures this method runs on the aggregator node.\n", - " \"\"\"\n", - " print(f\"This is the end of the flow\")" - ] - }, - { - "cell_type": "markdown", - "id": "7bc8fe27", - "metadata": {}, - "source": [ - "## Run Federated Learning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Calling start\n", - "\u001b[94mPerforming initialization for model\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Portland\u001b[0m\u001b[94m\n", - "\u001b[0m" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "Generating train split: 913 examples [00:01, 623.08 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "Generating train split: 104 examples [00:00, 583.62 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [13/13 00:12]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mPortland value of 0.5918120741844177\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling train\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "Generating train split: 913 examples [00:01, 616.37 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "Generating train split: 104 examples [00:00, 615.85 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "\u001b[0m`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [228/228 08:54, Epoch 0/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss
100.516900
200.373400
300.346100
400.339100
500.333000
600.323700
700.329800
800.312800
900.326000
1000.306800
1100.314900
1200.328300
1300.311300
1400.313400
1500.315400
1600.312200
1700.303000
1800.307400
1900.307600
2000.312500
2100.307000
2200.308000

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Calling local_model_validation\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "

\n", - " \n", - " \n", - " [13/13 00:13]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", - "\u001b[0mShould transfer from local_model_validation to join\n", - "\n", - "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Seattle\u001b[0m\u001b[94m\n", - "\u001b[0m" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.32it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "Generating train split: 907 examples [00:01, 626.09 
examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "Generating train split: 105 examples [00:00, 634.41 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [14/14 00:13]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mSeattle value of 0.589488685131073\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling train\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [227/227 08:53, Epoch 1/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss
100.500700
200.392300
300.364500
400.327800
500.342000
600.310900
700.318500
800.317900
900.333300
1000.321300
1100.312500
1200.301500
1300.314000
1400.317100
1500.316800
1600.318300
1700.318000
1800.295800
1900.311000
2000.310900
2100.327200
2200.311600

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Calling local_model_validation\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "

\n", - " \n", - " \n", - " [14/14 00:13]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", - "\u001b[0mShould transfer from local_model_validation to join\n", - "\n", - "Calling join\n", - "\u001b[94mAverage aggregated model validation values = 0.5906503796577454\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage training loss = 0.3295206361469617\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage local model validation values = 0.3146952837705612\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Portland\u001b[0m\u001b[94m\n", - "\u001b[0m" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.33it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [13/13 00:13]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mPortland value of 0.31504756212234497\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling train\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [228/228 08:57, Epoch 0/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss
100.314000
200.292000
300.287100
400.288900
500.283600
600.281300
700.290200
800.277900
900.291600
1000.278400
1100.285700
1200.302800
1300.291500
1400.295600
1500.299100
1600.298400
1700.291200
1800.297700
1900.298500
2000.305400
2100.301000
2200.303100

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Calling local_model_validation\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "

\n", - " \n", - " \n", - " [13/13 00:13]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", - "\u001b[0mShould transfer from local_model_validation to join\n", - "\n", - "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Seattle\u001b[0m\u001b[94m\n", - "\u001b[0m" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [14/14 00:13]\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mSeattle value of 0.31057578325271606\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling train\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [227/227 08:50, Epoch 1.00/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss
100.300900
200.307900
300.303400
400.274300
500.295200
600.270700
700.280300
800.284600
900.298700
1000.290900
1100.284300
1200.277500
1300.292400
1400.299300
1500.298800
1600.305300
1700.304600
1800.286900
1900.302600
2000.305300
2100.320000
2200.306800

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Setup federated learning participants\n", - "aggregator = Aggregator() # Central coordinator that aggregates model updates\n", - "collaborators = [\n", - " Collaborator(name=\"Portland\"), # First participant with local dataset\n", - " Collaborator(name=\"Seattle\") # Second participant with local dataset\n", - "]\n", - "\n", - "# Distribute data shards to collaborators (simulating data silos)\n", - "# Each collaborator gets a non-overlapping portion of the dataset\n", - "for idx, colab in enumerate(collaborators):\n", - " colab.private_attributes = {\n", - " \"train_dataset\": train_dataset.shard(len(collaborators), idx), # Training shard\n", - " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx) # Evaluation shard\n", - " }\n", - "\n", - "# Set up and execute the federated learning workflow\n", - "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators) # Local simulation runtime\n", - "flflow = FederatedFlow(model, rounds=2) # Create flow with 2 federated learning rounds\n", - "flflow.runtime = runtime # Assign runtime to the flow\n", - "flflow.run() # Start the federated learning process" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (myenv)", - "language": "python", - "name": "myenv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 46dba98a94edbb7a6b9b82dedf9d6b89d5dcf707 Mon Sep 17 00:00:00 2001 From: Rajith Date: Wed, 21 May 2025 16:11:00 +0530 Subject: [PATCH 31/34] resolving comments --- .../workflow/LLM/phi-4-withquantization.ipynb | 12068 +++++++++++++--- openfl/utilities/phi_utils.py | 434 + 2 files changed, 10433 
insertions(+), 2069 deletions(-) create mode 100644 openfl/utilities/phi_utils.py diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb index 8a33373f91..5e592b5873 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb @@ -5,7 +5,7 @@ "id": "a59f475d-d843-46bc-b75e-10984b687ed3", "metadata": {}, "source": [ - "# Enhanced Federated Fine-Tuning of Phi-4 Using OpenFL with PEFT & Quantization" + "# LLM Federated Finetuning with PEFT and Quantization" ] }, { @@ -13,11 +13,40 @@ "id": "20c74cb9-51a2-42e2-893f-d280e227e8bf", "metadata": {}, "source": [ - "In this tutorial, we demonstrate how to fine-tune Microsoft's Phi-4 model in a federated learning workflow with enhanced local training using:\n", - "- Parameter-Efficient Fine-Tuning (PEFT)\n", - "- 4-bit Quantization (QLoRA)\n", - "- Gradient Checkpointing\n", - "- Optimized Training Configuration" + "## Overview\n", + "\n", + "This tutorial demonstrates how to efficiently fine-tune Microsoft's Phi-4 model (7B parameter variant) in a federated learning workflow using OpenFL framework with advanced techniques for memory optimization and performance enhancement. 
The approach combines:\n", + "\n", + "### Memory Optimization Techniques\n", + "- **Parameter-Efficient Fine-Tuning (PEFT)**: Using Low-Rank Adaptation (LoRA) to fine-tune only a small subset of model parameters\n", + "- **Quantization**: Comparing 4-bit (NF4) and 8-bit quantization approaches with QLoRA to reduce memory footprint\n", + "- **Gradient Checkpointing**: Trading computation for memory by recomputing activations during backpropagation\n", + "\n", + "### Training Enhancements\n", + "- **Partial Round Updates**: Breaking each global round into partial updates for more frequent knowledge sharing\n", + "- **Fixed Training Steps**: Using a fixed number of training steps (100) equivalent to one epoch\n", + "- **Optimizer State Preservation**: Maintaining optimizer momentum across federation rounds\n", + "- **Memory Usage Tracking**: Detailed monitoring of GPU/CPU memory consumption across training phases\n", + "\n", + "### Federated Learning Architecture\n", + "- **Server-Client Model**: Central aggregator and multiple collaborators (simulated locally)\n", + "- **Federated Averaging**: Weighted parameter averaging between collaborator models\n", + "- **Metrics Visualization**: Tracking and comparing training loss, validation loss, and memory usage\n", + "\n", + "The tutorial implements a complete workflow that addresses common challenges in federated fine-tuning of large language models, including memory constraints, training efficiency, and performance metrics tracking across heterogeneous clients." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7241cc9a", + "metadata": {}, + "source": [ + "Before running the notebook make sure to install NVIDIA drivers using the below command\n", + "```\n", + "sudo apt update \n", + "sudo apt install -y nvidia-driver-550 \n", + "sudo reboot\n", + "```" ] }, { @@ -48,7 +77,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fri May 16 07:23:10 2025 \n", + "Wed May 21 08:21:44 2025 \n", "+-----------------------------------------------------------------------------------------+\n", "| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |\n", "|-----------------------------------------+------------------------+----------------------+\n", @@ -57,7 +86,7 @@ "| | | MIG M. |\n", "|=========================================+========================+======================|\n", "| 0 NVIDIA H100 NVL Off | 00000001:00:00.0 Off | 0 |\n", - "| N/A 41C P0 66W / 400W | 1MiB / 95830MiB | 0% Default |\n", + "| N/A 31C P0 61W / 400W | 1MiB / 95830MiB | 0% Default |\n", "| | | Disabled |\n", "+-----------------------------------------+------------------------+----------------------+\n", " \n", @@ -86,6 +115,16 @@ { "cell_type": "code", "execution_count": 3, + "id": "a3e6c3f4-dec3-4d3a-97cb-5b35bec06046", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install seaborn matplotlib pandas -q" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "be4690ae-0671-4d3a-8f21-620ab865a03e", "metadata": {}, "outputs": [ @@ -95,7 +134,7 @@ "text": [ "/home/azureuser/env_name/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", - "2025-05-16 07:23:13,756\tINFO util.py:154 -- Missing packages: ['ipywidgets']. 
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + "2025-05-21 08:21:48,953\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" ] } ], @@ -124,19 +163,28 @@ "from transformers.trainer_callback import PrinterCallback\n", "import transformers\n", "import gc\n", - "import psutil" + "import psutil\n", + "\n", + "# Import our utility functions\n", + "from openfl.utilities.phi_utils import (\n", + " get_gpu_memory_info,\n", + " MemoryTracker,\n", + " plot_memory_metrics,\n", + " plot_loss_metrics,\n", + " plot_aggregated_metrics\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "74fed8f2", "metadata": {}, "outputs": [], "source": [ "# Memory optimization setup\n", - "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\" # Enable dynamic memory allocation\n", - "os.environ[\"TRANSFORMERS_ATTN_IMPLEMENTATION\"] = \"flash_attention_2\" # Use optimized attention\n", + "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n", + "os.environ[\"TRANSFORMERS_ATTN_IMPLEMENTATION\"] = \"flash_attention_2\"\n", "\n", "def clear_gpu():\n", " torch.cuda.empty_cache()\n", @@ -147,20 +195,27 @@ }, { "cell_type": "markdown", - "id": "813b4917", + "id": "8fa0941e-5fd7-401b-9cc7-0beb5a2a3621", "metadata": {}, "source": [ "## Acquiring and preprocessing dataset\n", - "\n", "We can download the dataset directly from the [LLM-Adapters repository](https://github.com/AGI-Edgerunners/LLM-Adapters)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "6df7bfb4", + "execution_count": 6, + "id": "a50ae4a4-628d-4f45-a9fc-c5c437df229e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset already exists locally.\n" + ] + } + ], "source": [ "# Import libraries needed for downloading and verifying the 
dataset\n", "import hashlib\n", @@ -222,24 +277,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "eada9809-468a-47c6-9b03-55aa887c9487", "metadata": {}, "outputs": [], "source": [ "# Model and dataset\n", "model_name = \"microsoft/phi-4\"\n", - "#dataset_name = \"math_10k.json\"\n", + "dataset_name = \"math_10k.json\"\n", "\n", "# 4-bit QLoRA configuration\n", "bnb_config_4bit = BitsAndBytesConfig(\n", " load_in_4bit=True,\n", " bnb_4bit_quant_type=\"nf4\",\n", " bnb_4bit_compute_dtype=torch.bfloat16,\n", - " bnb_4bit_use_double_quant=False,\n", + " bnb_4bit_use_double_quant=True, # Enable double quantization for further memory saving\n", ")\n", "\n", - "# 8-bit QLoRA configuration\n", + "# 8-bit QLoRA configuration with more aggressive memory savings\n", "bnb_config_8bit = BitsAndBytesConfig(\n", " load_in_8bit=True,\n", " llm_int8_enable_fp32_cpu_offload=True,\n", @@ -251,9 +306,9 @@ "# Active quantization config (will be set to either 4-bit or 8-bit)\n", "bnb_config = bnb_config_4bit # Default to 4-bit\n", "\n", - "# LoRA configuration\n", + "# LoRA configuration - reduce parameters to save memory\n", "peft_config = LoraConfig(\n", - " r=8, # Increased from original for better adaptation\n", + " r=4, # Reduced from 8 to save memory\n", " lora_alpha=16,\n", " lora_dropout=0.01,\n", " bias=\"none\",\n", @@ -261,15 +316,18 @@ " target_modules=\"all-linear\",\n", ")\n", "\n", - "# Training configuration\n", + "# Training configuration with memory optimizations\n", "training_args = TrainingArguments(\n", " output_dir=\"./results\",\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=2, # Reduced for Phi-4\n", - " gradient_accumulation_steps=2,\n", + " # Reduced steps for testing\n", + " max_steps=50, # Reduced from 100 to 50 to save memory\n", + " per_device_train_batch_size=1,\n", + " per_device_eval_batch_size=1,\n", + " gradient_accumulation_steps=8, # Increased from 4 to 8 to reduce memory pressure\n", " 
optim=\"adamw_torch_fused\",\n", - " save_steps=100,\n", - " logging_steps=10,\n", + " # More frequent saving and logging\n", + " save_steps=25,\n", + " logging_steps=5,\n", " learning_rate=3e-4,\n", " weight_decay=0.001,\n", " fp16=False,\n", @@ -278,7 +336,12 @@ " warmup_ratio=0.02,\n", " lr_scheduler_type=\"cosine\",\n", " gradient_checkpointing=True,\n", - " report_to=\"none\"\n", + " report_to=\"none\",\n", + " # Enable memory optimization options\n", + " deepspeed=None, # Not using DeepSpeed but enabling other memory optimizations\n", + " optim_target_modules=[\"c_attn\", \"c_proj\"], # Optimize specific modules\n", + " # Add auto memory optimization flag\n", + " auto_find_batch_size=True # Automatically find the largest batch size that fits in memory\n", ")" ] }, @@ -292,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "8ab371f1-64c3-4225-82e7-fb3c5b05578c", "metadata": {}, "outputs": [ @@ -300,14 +363,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|██████████| 6/6 [00:04<00:00, 1.33it/s]\n" + "Loading checkpoint shards: 100%|██████████| 6/6 [00:04<00:00, 1.31it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "trainable params: 27,852,800 || all params: 14,687,360,000 || trainable%: 0.1896\n" + "trainable params: 13,926,400 || all params: 14,673,433,600 || trainable%: 0.0949\n" ] } ], @@ -343,28 +406,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "4392ddab-10b7-41f6-a8e0-65ba298ea457", "metadata": {}, "outputs": [], "source": [ "def format_prompt(example):\n", - " \"\"\"\n", - " Format the input example into a standardized prompt structure for training.\n", - " \n", - " This function creates a consistent instruction-response format for the LLM training:\n", - " - Includes instruction, optional input, and expected output\n", - " - Using a standardized template inspired by instruction-tuning datasets\n", - " 
\n", - " Parameters:\n", - " example (dict): Dictionary containing 'instruction', 'input' (optional), and 'output' fields\n", - " \n", - " Returns:\n", - " str: Formatted prompt with consistent structure for model training and evaluation\n", - " \"\"\"\n", - " # Handle case where input is provided\n", " if example[\"input\"]:\n", - " # Format with both instruction and input fields\n", " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", "\n", "### Instruction:\n", @@ -376,7 +424,6 @@ "### Response:\n", "{example['output']}\"\"\"\n", " else:\n", - " # Format with only instruction (no input field)\n", " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", "\n", "### Instruction:\n", @@ -387,10 +434,9 @@ "\n", "# Load dataset\n", "dataset = load_dataset(\"json\", data_files=dataset_name, split=\"train\", num_proc=4)\n", - "# Apply formatting to each example in the dataset\n", "dataset = dataset.map(lambda x: {\"text\": format_prompt(x)}, num_proc=4)\n", "\n", - "# Split dataset into training and evaluation subsets\n", + "# Split dataset\n", "dataset = dataset.train_test_split(test_size=0.1)\n", "train_dataset = dataset[\"train\"]\n", "eval_dataset = dataset[\"test\"]" @@ -406,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "6dc85c57-68b2-4514-9373-43e3d7c05c10", "metadata": {}, "outputs": [ @@ -414,32 +460,31 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. 
Will not be supported from version '0.13.0'.\n", + "/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length. Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", - "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", - "Generating train split: 1832 examples [00:02, 615.45 examples/s]\n", - "Generating train split: 197 examples [00:00, 575.95 examples/s]\n" + "Map: 100%|██████████| 8927/8927 [00:02<00:00, 3310.91 examples/s]\n", + "Map: 100%|██████████| 992/992 [00:00<00:00, 3267.10 examples/s]\n", + "max_steps is given, it will override any value given in num_train_epochs\n" ] } ], "source": [ + "# Create SFTTrainer\n", "trainer = SFTTrainer(\n", " model=model,\n", " train_dataset=train_dataset,\n", " eval_dataset=eval_dataset,\n", " peft_config=peft_config,\n", " dataset_text_field=\"text\",\n", - " max_seq_length=1024,\n", + " max_seq_length=512, # Reduced from 1024 to save memory\n", " tokenizer=tokenizer,\n", - " args=training_args,\n", - " packing=True,\n", + " args=training_args\n", ")" ] }, @@ 
-453,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "58298e8e-ab9e-4377-966e-143823441697", "metadata": {}, "outputs": [], @@ -493,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "e5e108c6-5150-4931-9c01-6b64a913fa04", "metadata": {}, "outputs": [ @@ -516,268 +561,14 @@ "import transformers\n", "import gc\n", "import psutil\n", - "\n", - "def get_gpu_memory_info():\n", - " \"\"\"Get detailed GPU memory usage information in megabytes.\n", - " \n", - " This function checks for CUDA availability and returns a dictionary with memory allocation\n", - " information including allocated, reserved, and maximum allocated GPU memory.\n", - " \n", - " Returns:\n", - " dict: Dictionary with memory usage information in MB:\n", - " - allocated: Currently allocated memory by PyTorch tensors\n", - " - reserved: Total memory reserved by PyTorch\n", - " - max_allocated: Maximum allocated memory since last reset\n", - " \n", - " Note:\n", - " Returns zeros for all metrics if CUDA is not available or if an error occurs.\n", - " \"\"\"\n", - " try:\n", - " if torch.cuda.is_available():\n", - " allocated = torch.cuda.memory_allocated() / (1024 * 1024)\n", - " reserved = torch.cuda.memory_reserved() / (1024 * 1024)\n", - " max_allocated = torch.cuda.max_memory_allocated() / (1024 * 1024)\n", - " return {\n", - " \"allocated\": allocated,\n", - " \"reserved\": reserved,\n", - " \"max_allocated\": max_allocated\n", - " }\n", - " else:\n", - " return {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n", - " except:\n", - " return {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n", - "\n", - "class MemoryTracker:\n", - " \"\"\"Track GPU memory usage during training\"\"\"\n", - " def __init__(self, collaborator_name, quant_type):\n", - " self.collaborator_name = collaborator_name\n", - " self.quant_type = quant_type\n", - " self.timestamps = {}\n", - " self.peak = {\"allocated\": 0, 
\"reserved\": 0, \"max_allocated\": 0}\n", - " self.training_loss = None\n", - " self.eval_loss = None\n", - " \n", - " def log(self, timestamp):\n", - " \"\"\"Log current memory usage at a specific timestamp\"\"\"\n", - " self.timestamps[timestamp] = get_gpu_memory_info()\n", - " \n", - " def log_loss(self, training_loss=None, eval_loss=None):\n", - " \"\"\"Log training or evaluation loss\"\"\"\n", - " if training_loss is not None:\n", - " self.training_loss = training_loss\n", - " if eval_loss is not None:\n", - " self.eval_loss = eval_loss\n", - " \n", - " def update_peak(self):\n", - " \"\"\"Update peak memory usage values\"\"\"\n", - " current = get_gpu_memory_info()\n", - " self.peak[\"allocated\"] = max(self.peak[\"allocated\"], current[\"allocated\"])\n", - " self.peak[\"reserved\"] = max(self.peak[\"reserved\"], current[\"reserved\"])\n", - " self.peak[\"max_allocated\"] = max(self.peak[\"max_allocated\"], current[\"max_allocated\"])\n", - " \n", - " def reset_peak(self):\n", - " \"\"\"Reset peak memory usage values\"\"\"\n", - " self.peak = {\"allocated\": 0, \"reserved\": 0, \"max_allocated\": 0}\n", - " \n", - " def report(self):\n", - " \"\"\"Print memory usage report\"\"\"\n", - " print(f\"\\n==== Memory Usage Report for {self.collaborator_name} ({self.quant_type}) ====\")\n", - " print(f\"Peak Memory Usage:\")\n", - " print(f\" Allocated: {self.peak['allocated']:.2f} MB\")\n", - " print(f\" Reserved: {self.peak['reserved']:.2f} MB\")\n", - " print(f\" Max Allocated: {self.peak['max_allocated']:.2f} MB\")\n", - " \n", - " print(\"\\nMemory Usage by Stage:\")\n", - " for timestamp, mem in self.timestamps.items():\n", - " print(f\" {timestamp}:\")\n", - " print(f\" Allocated: {mem['allocated']:.2f} MB\")\n", - " print(f\" Reserved: {mem['reserved']:.2f} MB\")\n", - " print(f\" Max Allocated: {mem['max_allocated']:.2f} MB\")\n", - " \n", - " print(\"\\nPerformance Metrics:\")\n", - " if self.training_loss is not None:\n", - " print(f\" Training Loss: 
{self.training_loss:.4f}\")\n", - " if self.eval_loss is not None:\n", - " print(f\" Evaluation Loss: {self.eval_loss:.4f}\")\n", - " print(\"-\" * 50)\n", - " \n", - " def get_stats(self):\n", - " \"\"\"Get all statistics as a dictionary\"\"\"\n", - " stats = {\n", - " \"peak_allocated\": self.peak[\"allocated\"],\n", - " \"peak_reserved\": self.peak[\"reserved\"],\n", - " \"peak_max_allocated\": self.peak[\"max_allocated\"],\n", - " \"quant_type\": self.quant_type,\n", - " \"training_loss\": self.training_loss,\n", - " \"eval_loss\": self.eval_loss\n", - " }\n", - " for timestamp, mem in self.timestamps.items():\n", - " stats[f\"{timestamp}_allocated\"] = mem[\"allocated\"]\n", - " stats[f\"{timestamp}_reserved\"] = mem[\"reserved\"]\n", - " stats[f\"{timestamp}_max_allocated\"] = mem[\"max_allocated\"]\n", - " return stats\n", - "\n", - "def plot_memory_metrics(flow_4bit, flow_8bit):\n", - " \"\"\"Plot and compare memory metrics between 4-bit and 8-bit quantization.\"\"\"\n", - " try:\n", - " import matplotlib.pyplot as plt\n", - " import pandas as pd\n", - " from matplotlib.ticker import EngFormatter\n", - "\n", - " # Create figure with multiple subplots\n", - " fig, axs = plt.subplots(2, 2, figsize=(16, 12))\n", - " fig.suptitle('4-bit vs 8-bit Quantization Comparison', fontsize=16)\n", - " \n", - " # Colors for consistent plotting\n", - " colors_4bit = {'Portland': 'blue', 'Seattle': 'green'}\n", - " colors_8bit = {'Portland': 'darkblue', 'Seattle': 'darkgreen'}\n", - " markers_4bit = {'Portland': 'o', 'Seattle': 's'}\n", - " markers_8bit = {'Portland': '^', 'Seattle': 'D'}\n", - " \n", - " # Flatten the metric data for plotting\n", - " memory_data = []\n", - " for quant, flow in [(\"4-bit\", flow_4bit), (\"8-bit\", flow_8bit)]:\n", - " stats = flow.all_memory_stats\n", - " for collab, rounds_data in stats.items():\n", - " for round_name, metrics in rounds_data.items():\n", - " round_num = int(round_name.split('_')[1])\n", - " row = {\n", - " 'Collaborator': 
collab,\n", - " 'Round': round_num,\n", - " 'Quantization': quant,\n", - " 'Peak Memory (MB)': metrics.get('peak_max_allocated', 0),\n", - " 'Training Loss': metrics.get('training_loss', 0),\n", - " 'Eval Loss': metrics.get('eval_loss', 0)\n", - " }\n", - " memory_data.append(row)\n", - " \n", - " df = pd.DataFrame(memory_data)\n", - " \n", - " # Plot 1: Peak Memory Usage by Round\n", - " axs[0, 0].set_title('Peak Memory Usage by Round')\n", - " for quant_type in ['4-bit', '8-bit']:\n", - " for collab in df['Collaborator'].unique():\n", - " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n", - " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n", - " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n", - " axs[0, 0].plot(subset['Round'], subset['Peak Memory (MB)'], \n", - " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n", - " color=color)\n", - " \n", - " axs[0, 0].set_xlabel('Round')\n", - " axs[0, 0].set_ylabel('Memory (MB)')\n", - " axs[0, 0].legend()\n", - " axs[0, 0].grid(True, alpha=0.3)\n", - " axs[0, 0].yaxis.set_major_formatter(EngFormatter(unit='B'))\n", - " \n", - " # Plot 2: Training Loss by Round\n", - " axs[0, 1].set_title('Training Loss by Round')\n", - " for quant_type in ['4-bit', '8-bit']:\n", - " for collab in df['Collaborator'].unique():\n", - " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n", - " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n", - " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n", - " axs[0, 1].plot(subset['Round'], subset['Training Loss'], \n", - " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n", - " color=color)\n", - " \n", - " axs[0, 1].set_xlabel('Round')\n", - " axs[0, 1].set_ylabel('Loss')\n", - " axs[0, 1].legend()\n", - " axs[0, 1].grid(True, alpha=0.3)\n", - " \n", - " # Plot 3: 
Eval Loss by Round\n", - " axs[1, 0].set_title('Evaluation Loss by Round')\n", - " for quant_type in ['4-bit', '8-bit']:\n", - " for collab in df['Collaborator'].unique():\n", - " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n", - " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n", - " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n", - " axs[1, 0].plot(subset['Round'], subset['Eval Loss'], \n", - " marker=marker, linestyle='-', label=f\"{collab} ({quant_type})\",\n", - " color=color)\n", - " \n", - " axs[1, 0].set_xlabel('Round')\n", - " axs[1, 0].set_ylabel('Loss')\n", - " axs[1, 0].legend()\n", - " axs[1, 0].grid(True, alpha=0.3)\n", - " \n", - " # Plot 4: Memory vs Loss (bubble chart)\n", - " axs[1, 1].set_title('Memory Usage vs. Evaluation Loss')\n", - " for quant_type in ['4-bit', '8-bit']:\n", - " for collab in df['Collaborator'].unique():\n", - " subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)]\n", - " color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab]\n", - " marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab]\n", - " \n", - " # Size proportional to round number for visual differentiation\n", - " sizes = [100 * (r+1) for r in subset['Round']]\n", - " \n", - " axs[1, 1].scatter(subset['Peak Memory (MB)'], subset['Eval Loss'],\n", - " s=sizes, alpha=0.7, \n", - " label=f\"{collab} ({quant_type})\",\n", - " color=color, marker=marker)\n", - " \n", - " # Add round number annotations\n", - " for _, row in subset.iterrows():\n", - " axs[1, 1].annotate(f\"R{int(row['Round'])}\", \n", - " (row['Peak Memory (MB)'], row['Eval Loss']),\n", - " xytext=(5, 5), textcoords='offset points')\n", - " \n", - " axs[1, 1].set_xlabel('Peak Memory (MB)')\n", - " axs[1, 1].set_ylabel('Evaluation Loss')\n", - " axs[1, 1].legend()\n", - " axs[1, 1].grid(True, alpha=0.3)\n", - " axs[1, 
1].xaxis.set_major_formatter(EngFormatter(unit='B'))\n", - " \n", - " plt.tight_layout()\n", - " plt.subplots_adjust(top=0.92)\n", - " plt.show()\n", - " \n", - " # Print summary comparison\n", - " print(\"\\n==== Performance Summary ====\\n\")\n", - " # Group by quantization and compute means\n", - " summary = df.groupby('Quantization').agg({\n", - " 'Peak Memory (MB)': 'mean',\n", - " 'Training Loss': 'mean', \n", - " 'Eval Loss': 'mean'\n", - " }).reset_index()\n", - " \n", - " # Calculate percentage difference\n", - " mem_diff_pct = ((summary.loc[1, 'Peak Memory (MB)'] - summary.loc[0, 'Peak Memory (MB)']) / \n", - " summary.loc[0, 'Peak Memory (MB)'] * 100)\n", - " \n", - " eval_diff_pct = ((summary.loc[1, 'Eval Loss'] - summary.loc[0, 'Eval Loss']) / \n", - " summary.loc[0, 'Eval Loss'] * 100)\n", - " \n", - " print(f\"Memory Usage Comparison:\")\n", - " print(f\" 4-bit Avg: {summary.loc[0, 'Peak Memory (MB)']:.2f} MB\")\n", - " print(f\" 8-bit Avg: {summary.loc[1, 'Peak Memory (MB)']:.2f} MB\")\n", - " print(f\" Difference: {abs(mem_diff_pct):.1f}% {'more' if mem_diff_pct > 0 else 'less'} memory with 8-bit\")\n", - " \n", - " print(f\"\\nEvaluation Loss Comparison:\")\n", - " print(f\" 4-bit Avg: {summary.loc[0, 'Eval Loss']:.4f}\")\n", - " print(f\" 8-bit Avg: {summary.loc[1, 'Eval Loss']:.4f}\")\n", - " print(f\" Difference: {abs(eval_diff_pct):.1f}% {'higher' if eval_diff_pct > 0 else 'lower'} loss with 8-bit\")\n", - " \n", - " loss_efficiency = ((summary.loc[0, 'Eval Loss'] - summary.loc[1, 'Eval Loss']) / \n", - " (summary.loc[0, 'Peak Memory (MB)'] - summary.loc[1, 'Peak Memory (MB)']))\n", - " \n", - " if loss_efficiency > 0:\n", - " efficiency_msg = \"8-bit provides better memory efficiency with lower loss\"\n", - " else:\n", - " efficiency_msg = \"4-bit provides better memory efficiency with lower loss\"\n", - " \n", - " print(f\"\\nEfficiency Analysis: {efficiency_msg}\")\n", - " except ImportError:\n", - " print(\"Plotting requires matplotlib and 
pandas. Install with: pip install matplotlib pandas\")\n", - " except Exception as e:\n", - " print(f\"Error plotting metrics: {str(e)}\")\n", + "import os\n", + "import math\n", + "import time\n", "\n", "class FederatedFlow(FLSpec):\n", " def __init__(self, model=None, optimizer=None, rounds=3, quant_type=\"4bit\", **kwargs):\n", " \"\"\"\n", - " Initialize the class with the given model, optimizer, and number of rounds.\n", + " Initialize the class with the given model, optimizer, and training parameters.\n", "\n", " Parameters:\n", " model (torch.nn.Module, optional): The model to be used. If None, a ValueError is raised.\n", @@ -803,6 +594,8 @@ " self.average_loss_history = []\n", " self.agg_model_loss_history = []\n", " self.local_model_loss_history = []\n", + " # Dictionary to store optimizer states for each collaborator\n", + " self.optimizer_states = {}\n", " \n", "\n", " @aggregator\n", @@ -815,8 +608,10 @@ " of the federated learning process.\n", " \"\"\"\n", " print(f\"Performing initialization for model with {self.quant_type} quantization\")\n", + " print(f\"Using {self.rounds} main rounds with partial round updates\")\n", " self.collaborators = self.runtime.collaborators\n", " self.current_round = 0\n", + " self.current_sub_round = 0\n", " # Initialize dictionary to collect memory stats\n", " # Check if collaborators are objects with name attribute or strings\n", " if hasattr(self.collaborators[0], 'name'):\n", @@ -825,6 +620,8 @@ " # If collaborators are already strings, use them directly\n", " collab_names = self.collaborators\n", " self.all_memory_stats = {collab: {} for collab in collab_names}\n", + " # Initialize optimizer states dictionary for each collaborator\n", + " self.optimizer_states = {collab: None for collab in collab_names}\n", " self.next(\n", " self.aggregated_model_validation,\n", " foreach=\"collaborators\",\n", @@ -840,7 +637,7 @@ " the model using the provided training and evaluation datasets. 
The validation\n", " score is then stored and the next step in the process is triggered.\n", " \"\"\"\n", - " print(f\"Performing aggregated model validation for collaborator {self.input} with {self.quant_type}\")\n", + " print(f\"[Round {self.current_round}, Update {self.current_sub_round}] Performing aggregated model validation for collaborator {self.input} with {self.quant_type}\")\n", " # Initialize memory tracker for this collaborator\n", " self.memory_tracker = MemoryTracker(self.input, self.quant_type)\n", " self.memory_tracker.reset_peak()\n", @@ -852,7 +649,6 @@ " quant_config = bnb_config_8bit\n", " \n", " # Define device_map variable\n", - " #device_map = \"auto\"\n", " device_map = {\"\": torch.cuda.current_device()} if torch.cuda.is_available() else \"cpu\"\n", " try:\n", " self.model = AutoModelForCausalLM.from_pretrained(\n", @@ -876,9 +672,34 @@ " self.model = get_peft_model(self.model, peft_config)\n", " set_peft_model_state_dict(self.model, self.peft_params)\n", " \n", + " # Use fixed number of steps (max_steps) for each round\n", + " steps_per_round = training_args.max_steps # Use the hardcoded 100 steps\n", + " \n", + " # Create a custom TrainingArguments for this round\n", + " self.round_args = TrainingArguments(\n", + " output_dir=training_args.output_dir,\n", + " max_steps=steps_per_round, # Use fixed steps per round\n", + " per_device_train_batch_size=training_args.per_device_train_batch_size,\n", + " gradient_accumulation_steps=training_args.gradient_accumulation_steps,\n", + " optim=training_args.optim,\n", + " save_steps=steps_per_round // 2 or 1, # More frequent saving\n", + " logging_steps=5,\n", + " learning_rate=training_args.learning_rate,\n", + " weight_decay=training_args.weight_decay,\n", + " fp16=training_args.fp16,\n", + " bf16=training_args.bf16,\n", + " max_grad_norm=training_args.max_grad_norm,\n", + " warmup_ratio=training_args.warmup_ratio,\n", + " lr_scheduler_type=training_args.lr_scheduler_type,\n", + " 
gradient_checkpointing=training_args.gradient_checkpointing,\n", + " report_to=training_args.report_to\n", + " )\n", + " \n", + " print(f\"[{self.input}] Training with {steps_per_round} steps\")\n", + " \n", " trainer = SFTTrainer(\n", " model=self.model,\n", - " args=training_args,\n", + " args=self.round_args, # Use round specific args\n", " peft_config=peft_config,\n", " train_dataset=self.train_dataset,\n", " eval_dataset=self.eval_dataset,\n", @@ -894,7 +715,7 @@ " trainer.remove_callback(PrinterCallback)\n", " out = trainer.evaluate()\n", " self.agg_validation_score = out[\"eval_loss\"]\n", - " print(f\"{self.input} value of {self.agg_validation_score}\")\n", + " print(f\"{self.input} evaluation loss: {self.agg_validation_score}\")\n", " self.memory_tracker.log_loss(eval_loss=self.agg_validation_score) # Log eval loss\n", " self.memory_tracker.update_peak()\n", " self.next(self.train)\n", @@ -902,34 +723,207 @@ " @collaborator\n", " def train(self):\n", " \"\"\"\n", - " Train the model for a collaborator.\n", + " Train the model for a collaborator with partial epoch updates.\n", "\n", - " This method trains the model using the provided training and evaluation datasets.\n", - " The training loss is stored, the model is saved, and the next step in the process\n", - " is triggered.\n", + " This method trains the model using the provided training dataset,\n", + " but processes it in smaller chunks (partial epochs) to allow more\n", + " frequent parameter sharing between collaborators.\n", " \"\"\"\n", " self.memory_tracker.log(\"before_training\")\n", + " \n", + " # Reduce steps for 8-bit quantization\n", + " if self.quant_type == \"8bit\":\n", + " max_steps = training_args.max_steps // 2 # Half the steps for 8-bit\n", + " else:\n", + " max_steps = training_args.max_steps\n", + " \n", + " # Define partial training args\n", + " self.sub_round_args = TrainingArguments(\n", + " output_dir=training_args.output_dir,\n", + " max_steps=max_steps,\n", + " 
per_device_train_batch_size=training_args.per_device_train_batch_size,\n", + " gradient_accumulation_steps=training_args.gradient_accumulation_steps,\n", + " optim=training_args.optim,\n", + " save_steps=max_steps // 4,\n", + " logging_steps=2,\n", + " learning_rate=training_args.learning_rate,\n", + " weight_decay=training_args.weight_decay,\n", + " fp16=training_args.fp16,\n", + " bf16=training_args.bf16,\n", + " max_grad_norm=training_args.max_grad_norm,\n", + " warmup_ratio=training_args.warmup_ratio,\n", + " lr_scheduler_type=training_args.lr_scheduler_type,\n", + " gradient_checkpointing=training_args.gradient_checkpointing,\n", + " report_to=training_args.report_to,\n", + " auto_find_batch_size=True # Add auto batch size finding\n", + " )\n", + " \n", + " # Create trainer instance with our custom training args\n", " trainer = SFTTrainer(\n", " model=self.model,\n", - " args=training_args,\n", + " args=self.sub_round_args,\n", " peft_config=peft_config,\n", " train_dataset=self.train_dataset,\n", " eval_dataset=self.eval_dataset,\n", - " max_seq_length=1024,\n", + " max_seq_length=512, # Reduced sequence length\n", " dataset_text_field=\"text\",\n", " tokenizer=tokenizer,\n", - " packing=True,\n", - " data_collator=transformers.DataCollatorForSeq2Seq(\n", - " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n", - " ),\n", " )\n", - "\n", - " out = trainer.train()\n", - " self.loss = out.training_loss\n", + " \n", + " # Make sure optimizer is initialized before training\n", + " if not hasattr(trainer, 'optimizer') or trainer.optimizer is None:\n", + " trainer.create_optimizer_and_scheduler(num_training_steps=self.sub_round_args.max_steps)\n", + " if trainer.optimizer is None:\n", + " print(f\"[{self.input}] Warning: Failed to create optimizer. 
Creating standard optimizer.\")\n", + " # Create a simple optimizer if trainer.create_optimizer_and_scheduler failed\n", + " from torch.optim import AdamW\n", + " trainer.optimizer = AdamW(\n", + " trainer.model.parameters(),\n", + " lr=self.sub_round_args.learning_rate,\n", + " weight_decay=self.sub_round_args.weight_decay\n", + " )\n", + " # Create a simple scheduler\n", + " from transformers import get_scheduler\n", + " trainer.lr_scheduler = get_scheduler(\n", + " name=self.sub_round_args.lr_scheduler_type,\n", + " optimizer=trainer.optimizer,\n", + " num_warmup_steps=int(self.sub_round_args.max_steps * self.sub_round_args.warmup_ratio),\n", + " num_training_steps=self.sub_round_args.max_steps,\n", + " )\n", + " \n", + " # Restore optimizer state if available from previous rounds\n", + " if self.optimizer_states.get(self.input) is not None:\n", + " print(f\"[{self.input}] Restoring optimizer state\")\n", + " try:\n", + " # Load the optimizer state\n", + " trainer.optimizer.load_state_dict(self.optimizer_states[self.input])\n", + " except Exception as e:\n", + " print(f\"Failed to restore optimizer state: {e}\")\n", + " \n", + " # For 8-bit quantization with limited GPU memory, use simplified training\n", + " if self.quant_type == \"8bit\":\n", + " # Simplify training for 8-bit\n", + " try:\n", + " # Use trainer.train() for simpler training flow\n", + " trainer.train(resume_from_checkpoint=False)\n", + " # Get the last loss\n", + " self.loss = trainer.state.log_history[-1].get('loss', float('inf'))\n", + " except Exception as e:\n", + " print(f\"Training failed with error: {str(e)}\")\n", + " self.loss = float('inf') # Set to infinity to indicate failure\n", + " else:\n", + " # Regular training with manual control for 4-bit\n", + " print(f\"[{self.input}] Starting partial epoch training with {max_steps} steps\")\n", + " trainer.model.train()\n", + " total_loss = 0\n", + " step_count = 0\n", + " \n", + " # Set up dataloader for manual batching\n", + " dataloader 
= trainer.get_train_dataloader()\n", + " \n", + " # Process batches manually for more control\n", + " for step, inputs in enumerate(dataloader):\n", + " # Move inputs to the appropriate device\n", + " inputs = {k: v.to(trainer.args.device) for k, v in inputs.items()}\n", + " \n", + " # Forward pass\n", + " outputs = trainer.model(**inputs)\n", + " \n", + " # Handle different output formats\n", + " if isinstance(outputs, dict):\n", + " if \"loss\" in outputs:\n", + " loss = outputs[\"loss\"] / trainer.args.gradient_accumulation_steps\n", + " else:\n", + " # Calculate loss manually if not provided in outputs\n", + " # Get logits from outputs\n", + " if \"logits\" in outputs:\n", + " logits = outputs[\"logits\"]\n", + " # Get labels from inputs\n", + " labels = inputs.get(\"labels\")\n", + " if labels is not None:\n", + " # Calculate loss using cross-entropy\n", + " import torch.nn.functional as F\n", + " # Shift logits and labels for causal LM\n", + " shift_logits = logits[..., :-1, :].contiguous()\n", + " shift_labels = labels[..., 1:].contiguous()\n", + " loss = F.cross_entropy(\n", + " shift_logits.view(-1, shift_logits.size(-1)),\n", + " shift_labels.view(-1),\n", + " ignore_index=-100\n", + " ) / trainer.args.gradient_accumulation_steps\n", + " else:\n", + " print(f\"Warning: No labels in inputs, using dummy loss\")\n", + " loss = (outputs[\"logits\"].sum() * 0.0) / trainer.args.gradient_accumulation_steps\n", + " else:\n", + " print(f\"Warning: No logits or loss in outputs, using dummy loss\")\n", + " # Use any tensor from outputs for a dummy loss\n", + " dummy_tensor = next(iter(outputs.values()))\n", + " loss = (dummy_tensor.sum() * 0.0) / trainer.args.gradient_accumulation_steps\n", + " else:\n", + " loss = outputs.loss / trainer.args.gradient_accumulation_steps\n", + " \n", + " total_loss += loss.detach().float()\n", + " \n", + " # Backward pass\n", + " loss.backward()\n", + " \n", + " # Update weights on gradient accumulation steps or at the end\n", + " if 
((step + 1) % trainer.args.gradient_accumulation_steps == 0) or (step == len(dataloader) - 1):\n", + " # Double check optimizer exists before using it\n", + " if trainer.optimizer is None:\n", + " print(f\"[{self.input}] Warning: Optimizer is None at step {step}. Creating optimizer.\")\n", + " # Create a simple optimizer\n", + " from torch.optim import AdamW\n", + " trainer.optimizer = AdamW(\n", + " trainer.model.parameters(), \n", + " lr=self.sub_round_args.learning_rate,\n", + " weight_decay=self.sub_round_args.weight_decay\n", + " )\n", + " # Create a simple scheduler\n", + " from transformers import get_scheduler\n", + " trainer.lr_scheduler = get_scheduler(\n", + " name=self.sub_round_args.lr_scheduler_type,\n", + " optimizer=trainer.optimizer,\n", + " num_warmup_steps=int(self.sub_round_args.max_steps * self.sub_round_args.warmup_ratio),\n", + " num_training_steps=self.sub_round_args.max_steps,\n", + " )\n", + " \n", + " trainer.optimizer.step()\n", + " trainer.lr_scheduler.step()\n", + " trainer.optimizer.zero_grad()\n", + " step_count += 1\n", + " \n", + " # Log progress\n", + " if step_count > 0 and step_count % 10 == 0:\n", + " print(f\"[{self.input}] Completed {step_count} steps, current loss: {total_loss/step_count:.4f}\")\n", + " \n", + " # Stop after max_steps\n", + " if step_count >= max_steps:\n", + " break\n", + " \n", + " # Calculate final training loss\n", + " self.loss = total_loss / step_count if step_count > 0 else 0\n", + " \n", + " print(f\"[{self.input}] Training completed, average loss: {self.loss:.4f}\")\n", + " \n", + " # Log memory and training metrics\n", " self.memory_tracker.log(\"after_training\")\n", - " self.memory_tracker.log_loss(training_loss=self.loss) # Log training loss\n", + " self.memory_tracker.log_loss(training_loss=self.loss)\n", " self.memory_tracker.update_peak()\n", - " trainer.save_model()\n", + " \n", + " # Save optimizer state for next round\n", + " if hasattr(trainer, 'optimizer') and trainer.optimizer is not 
None:\n", + " self.optimizer_states[self.input] = trainer.optimizer.state_dict()\n", + " # Create directory for saving optimizer state if needed\n", + " os.makedirs(f\"./optimizer_state/{self.input}\", exist_ok=True)\n", + " # Save optimizer state to disk as backup\n", + " torch.save(\n", + " trainer.optimizer.state_dict(), \n", + " f\"./optimizer_state/{self.input}/optimizer_round_{self.current_round}_update_{self.current_sub_round}.pt\"\n", + " )\n", + " \n", + " # Save model checkpoint\n", + " trainer.save_model(f\"./local_models/{self.input}/round_{self.current_round}_update_{self.current_sub_round}\")\n", " self.training_completed = True\n", " self.next(self.local_model_validation)\n", "\n", @@ -944,7 +938,7 @@ " \"\"\"\n", " trainer = SFTTrainer(\n", " model=self.model,\n", - " args=training_args,\n", + " args=self.sub_round_args, # Use sub-round specific args\n", " peft_config=peft_config,\n", " train_dataset=self.train_dataset,\n", " eval_dataset=self.eval_dataset,\n", @@ -958,6 +952,7 @@ " )\n", " out = trainer.evaluate()\n", " self.local_validation_score = out[\"eval_loss\"]\n", + " print(f\"[{self.input}] Local evaluation loss: {self.local_validation_score}\")\n", " self.memory_tracker.log_loss(eval_loss=self.local_validation_score) # Log eval loss\n", " self.peft_params = get_peft_model_state_dict(self.model)\n", " print(f\"Doing local model validation for collaborator {self.input}\")\n", @@ -984,10 +979,10 @@ " input.local_validation_score for input in inputs\n", " ) / len(inputs)\n", " print(\n", - " f\"Average aggregated model validation values = {self.aggregated_model_accuracy}\"\n", + " f\"[Round {self.current_round}, Update {self.current_sub_round}] Average aggregated model validation loss = {self.aggregated_model_accuracy}\"\n", " )\n", - " print(f\"Average training loss = {self.average_loss}\")\n", - " print(f\"Average local model validation values = {self.local_model_accuracy}\")\n", + " print(f\"[Round {self.current_round}, Update 
{self.current_sub_round}] Average training loss = {self.average_loss}\")\n", + " print(f\"[Round {self.current_round}, Update {self.current_sub_round}] Average local model validation loss = {self.local_model_accuracy}\")\n", "\n", " # Store metrics in history for plotting trends\n", " self.average_loss_history.append(self.average_loss)\n", @@ -996,14 +991,36 @@ " \n", " # Collect memory stats from all collaborators for this round\n", " for input_data in inputs:\n", - " self.all_memory_stats[input_data.input][f\"round_{self.current_round}\"] = input_data.memory_stats\n", + " round_key = f\"round_{self.current_round}_update_{self.current_sub_round}\"\n", + " self.all_memory_stats[input_data.input][round_key] = input_data.memory_stats\n", + " # Update optimizer states from collaborators\n", + " if hasattr(input_data, 'optimizer_states') and input_data.optimizer_states.get(input_data.input) is not None:\n", + " self.optimizer_states[input_data.input] = input_data.optimizer_states[input_data.input]\n", "\n", + " # Save aggregated optimizer states for debug/analysis\n", + " os.makedirs(\"./optimizer_state/aggregator\", exist_ok=True)\n", + " torch.save(\n", + " self.optimizer_states, \n", + " f\"./optimizer_state/aggregator/optimizers_round_{self.current_round}_update_{self.current_sub_round}.pt\"\n", + " )\n", + " \n", " self.model = FedAvg([input.peft_params for input in inputs], self.model)\n", " self.peft_params = get_peft_model_state_dict(self.model)\n", "\n", - " self.model.save_pretrained(\"./aggregated/model\")\n", - " tokenizer.save_pretrained(\"./aggregated/tokenizer\")\n", - " self.current_round += 1\n", + " # Save aggregated model after each sub-round update\n", + " save_dir = f\"./aggregated/model_round_{self.current_round}_update_{self.current_sub_round}\"\n", + " os.makedirs(save_dir, exist_ok=True)\n", + " self.model.save_pretrained(save_dir)\n", + " 
tokenizer.save_pretrained(f\"./aggregated/tokenizer_round_{self.current_round}_update_{self.current_sub_round}\")\n", + " \n", + " # Update round and sub-round counters\n", + " self.current_sub_round += 1\n", + " # Each update is treated as a partial update within a full round\n", + " # Update main round counter and reset sub-round when appropriate\n", + " if self.current_sub_round >= 2: # Default to 2 partial updates per round\n", + " self.current_sub_round = 0\n", + " self.current_round += 1\n", + " \n", " if self.current_round < self.rounds:\n", " self.next(\n", " self.aggregated_model_validation,\n", @@ -1027,11 +1044,24 @@ " print(f\"Final Aggregated Model Loss: {self.agg_model_loss_history[-1]:.4f}\")\n", " print(f\"Final Local Model Loss: {self.local_model_loss_history[-1]:.4f}\")\n", " \n", + " print(\"\\n===== Metric History =====\\n\")\n", + " print(\"Training Loss History:\")\n", + " for i, loss in enumerate(self.average_loss_history):\n", + " print(f\" Update {i}: {loss:.4f}\")\n", + " \n", + " print(\"\\nAggregated Model Loss History:\")\n", + " for i, loss in enumerate(self.agg_model_loss_history):\n", + " print(f\" Update {i}: {loss:.4f}\")\n", + " \n", + " print(\"\\nLocal Model Loss History:\")\n", + " for i, loss in enumerate(self.local_model_loss_history):\n", + " print(f\" Update {i}: {loss:.4f}\")\n", + " \n", " print(\"\\n===== Memory Usage Summary Across All Rounds =====\\n\")\n", " \n", " # Print aggregated memory statistics\n", " for collab, rounds_data in self.all_memory_stats.items():\n", - " print(f\"\\n==== {collab} Memory Usage Across Rounds ({self.quant_type}) ====\\n\")\n", + " print(f\"\\n==== {collab} Memory Usage Across Rounds/Updates ({self.quant_type}) ====\\n\")\n", " for round_name, stats in rounds_data.items():\n", " print(f\" {round_name}:\")\n", " for metric, value in stats.items():\n", @@ -1051,14 +1081,16 @@ "id": "7bc8fe27", "metadata": {}, "source": [ - "## Run Federated Learning with 4-bit and 8-bit Quantization" + "## 
Run Federated Learning with 4-bit Quantization" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "id": "38894111-41d9-4dd4-b1c8-eb7ec3cdd3e1", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", @@ -1070,9 +1102,10 @@ "\n", "Calling start\n", "\u001b[94mPerforming initialization for model with 4bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mUsing 5 main rounds with partial round updates\u001b[0m\u001b[94m\n", "\u001b[0m\n", "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, @@ -1080,8 +1113,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", @@ -1091,8 +1139,11 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", - "Generating train split: 915 examples [00:01, 626.55 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "Generating train split: 96 examples [00:00, 570.74 examples/s]\u001b[0m\u001b[94m0m\u001b[94m\n", + "Generating train split: 912 examples [00:01, 622.96 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 103 examples [00:00, 627.49 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m" ] }, @@ -1102,8 +1153,8 @@ "\n", "

\n", " \n", - " \n", - " [12/12 00:11]\n", + " \n", + " [13/13 00:12]\n", "
\n", " " ], @@ -1118,7 +1169,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mPortland value of 0.5819987058639526\u001b[0m\u001b[94m\n", + "\u001b[94mPortland evaluation loss: 0.5811071991920471\u001b[0m\u001b[94m\n", "\u001b[0m\n", "Calling train\n" ] @@ -1127,141 +1178,63 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", - "Generating train split: 915 examples [00:01, 619.44 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "Generating train split: 96 examples [00:00, 628.10 
examples/s]\u001b[0m\u001b[94m0m\u001b[94m\n", - "\u001b[0m`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" + "Map: 100%|##########| 4464/4464 [00:01<00:00, 3487.12 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "Map: 100%|##########| 496/496 [00:00<00:00, 3341.29 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" ] }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [229/229 09:05, Epoch 1/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss
100.501600
200.373200
300.337300
400.348300
500.334300
600.331800
700.322600
800.325800
900.320700
1000.328900
1100.303000
1200.313100
1300.312800
1400.320100
1500.308100
1600.312800
1700.325200
1800.307200
1900.320100
2000.310600
2100.308200
2200.310700

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5313\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5340\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5390\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5433\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5464\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5491\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5525\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.5587\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4287\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4316\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4328\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4357\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4377\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4393\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4421\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.4445\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3906\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3917\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3931\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3945\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3967\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3976\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.3993\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.4006\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3724\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3728\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3738\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3752\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3763\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3773\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3783\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.3791\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.3572\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.3572\u001b[0m\u001b[94m\n", + "\u001b[0m\n", "Calling local_model_validation\n" ] }, @@ -1279,6 +1252,11 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "Generating train split: 912 examples [00:01, 618.15 
examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 103 examples [00:00, 632.61 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m" ] }, @@ -1288,8 +1266,8 @@ "\n", "

\n", " \n", - " \n", - " [12/12 00:12]\n", + " \n", + " [13/13 00:13]\n", "
\n", " " ], @@ -1304,36 +1282,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[94m[Portland] Local evaluation loss: 0.3916991055011749\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35323.02 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 60420.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 32696.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57374.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54745.21 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Memory Usage by Stage:\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 32979.33 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 45302.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 48011.95 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 30342.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 41548.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 43451.02 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35078.37 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 60420.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 32388.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57268.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54745.21 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m 
after_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35323.02 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 41244.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57461.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 32696.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57374.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54745.21 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Performance Metrics:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Training Loss: 0.3295\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Evaluation Loss: 0.3029\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.3572\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3917\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0mShould transfer from local_model_validation to join\n", "\n", "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, @@ -1342,7 +1321,22 @@ "output_type": "stream", "text": [ "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", @@ -1352,8 +1346,11 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", - "Generating train split: 917 examples [00:01, 622.35 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", - "Generating train split: 100 examples [00:00, 631.84 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 912 examples [00:01, 631.44 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "Generating train split: 101 examples [00:00, 625.21 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m" ] }, @@ -1364,7 +1361,7 @@ "
\n", " \n", " \n", - " [13/13 00:13]\n", + " [13/13 00:12]\n", "
\n", " " ], @@ -1379,7 +1376,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mSeattle value of 0.5914124846458435\u001b[0m\u001b[94m\n", + "\u001b[94mSeattle evaluation loss: 0.5805659294128418\u001b[0m\u001b[94m\n", "\u001b[0m\n", "Calling train\n" ] @@ -1388,139 +1385,62 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "Map: 100%|##########| 4463/4463 [00:01<00:00, 3422.90 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "Map: 100%|##########| 496/496 [00:00<00:00, 3404.21 
examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", "\u001b[0m" ] }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [229/229 09:07, Epoch 0/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss
100.487200
200.371700
300.346800
400.334800
500.324900
600.319300
700.331200
800.336300
900.310900
1000.308100
1100.313600
1200.312600
1300.318200
1400.323600
1500.304700
1600.323100
1700.305100
1800.314500
1900.303500
2000.314900
2100.327900
2200.314100

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5439\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5469\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5501\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5523\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5550\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5580\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5638\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.5660\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4307\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4329\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4347\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4358\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4375\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4432\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4451\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.4487\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.3982\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.3988\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.3996\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4012\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4025\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4033\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4041\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.4052\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3717\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3723\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3728\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3741\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3749\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3755\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3769\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.3778\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.3601\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.3601\u001b[0m\u001b[94m\n", + "\u001b[0m\n", "Calling local_model_validation\n" ] }, @@ -1538,6 +1458,10 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "Generating train split: 912 examples [00:01, 639.46 
examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m" ] }, @@ -1563,41 +1487,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[94m[Seattle] Local evaluation loss: 0.3990606665611267\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 23170.02 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 48256.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 21814.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 47222.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Memory Usage by Stage:\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 20890.32 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 33280.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 19625.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 30754.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m 
before_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 22957.36 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 48256.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 21639.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 47114.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 23170.02 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 29538.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 21814.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 47222.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Performance Metrics:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Training Loss: 0.3287\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Evaluation Loss: 0.3178\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.3601\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3991\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0mShould transfer from local_model_validation to join\n", "\n", "Calling join\n", - "\u001b[94mAverage aggregated model validation values = 0.5867055952548981\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage training loss = 0.3290792714039832\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage local model validation values = 0.310357466340065\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 0, Update 0] Average aggregated model validation loss = 0.5808365643024445\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 0] Average training loss = 0.35865288972854614\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Round 0, Update 0] Average local model validation loss = 0.3953798860311508\u001b[0m\u001b[94m\n", "\u001b[0m\n", "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 0, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, @@ -1605,8 +1530,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", @@ -1616,6 +1556,10 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "Generating train split: 912 examples [00:01, 650.05 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m" ] }, @@ -1625,8 +1569,8 @@ "\n", "

\n", " \n", - " \n", - " [12/12 00:12]\n", + " \n", + " [13/13 00:13]\n", "
\n", " " ], @@ -1641,7 +1585,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mPortland value of 0.30139902234077454\u001b[0m\u001b[94m\n", + "\u001b[94mPortland evaluation loss: 0.38848692178726196\u001b[0m\u001b[94m\n", "\u001b[0m\n", "Calling train\n" ] @@ -1650,139 +1594,62 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "Map: 100%|##########| 496/496 [00:00<00:00, 3328.62 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any 
value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", "\u001b[0m" ] }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [229/229 09:06, Epoch 1/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss
100.298900
200.293900
300.279100
400.293300
500.290500
600.291100
700.280500
800.289900
900.289200
1000.298400
1100.275500
1200.290700
1300.290200
1400.301600
1500.290300
1600.299000
1700.313100
1800.297100
1900.313400
2000.303800
2100.302100
2200.306200

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2647\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2668\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2703\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2761\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2782\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2813\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2647\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2676\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2687\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2708\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2726\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2740\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2758\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2780\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2695\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2704\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2717\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2729\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2756\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2772\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2783\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2724\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2732\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2745\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2752\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2762\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2770\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2778\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.2645\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.2645\u001b[0m\u001b[94m\n", + "\u001b[0m\n", "Calling local_model_validation\n" ] }, @@ -1800,6 +1667,10 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " 
warnings.warn(\n", + "Generating train split: 103 examples [00:00, 645.41 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m" ] }, @@ -1809,8 +1680,8 @@ "\n", "

\n", " \n", - " \n", - " [12/12 00:12]\n", + " \n", + " [13/13 00:13]\n", "
\n", " " ], @@ -1825,36 +1696,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[94m[Portland] Local evaluation loss: 0.3903903663158417\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 59824.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33547.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Memory Usage by Stage:\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 33255.83 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 33440.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31364.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31586.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 54883.62 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35322.87 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 59824.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33378.03 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57670.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m 
after_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 41948.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57706.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33547.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Performance Metrics:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Training Loss: 0.2949\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Evaluation Loss: 0.2986\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2645\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3904\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0mShould transfer from local_model_validation to join\n", "\n", "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 0, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, @@ -1862,7 +1734,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", @@ -1873,6 +1745,23 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m" ] }, @@ -1898,11 +1787,74 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mSeattle value of 0.3157660961151123\u001b[0m\u001b[94m\n", + "\u001b[94mSeattle evaluation loss: 0.3931271433830261\u001b[0m\u001b[94m\n", "\u001b[0m\n", "Calling train\n" ] }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "Map: 100%|##########| 496/496 [00:00<00:00, 3406.01 examples/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. 
Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2763\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2785\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2815\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2831\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2853\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2878\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2919\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2938\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2690\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2709\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2725\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2735\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2749\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2798\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2815\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2844\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 
0.2730\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2736\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2743\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2757\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2770\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2778\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2785\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2795\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2696\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2701\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2707\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2717\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2725\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2730\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2740\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.2659\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.2659\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling local_model_validation\n" + ] + }, { "name": "stderr", "output_type": "stream", @@ -1917,6 +1869,10 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the 
value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "Generating train split: 101 examples [00:00, 653.25 examples/s]\u001b[0m\u001b[94mm\u001b[94m\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m" ] }, @@ -1926,104 +1882,8265 @@ "\n", "
\n", " \n", - " \n", - " [229/229 09:07, Epoch 0/1]\n", + " \n", + " [13/13 00:13]\n", "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.3981446623802185\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33447.23 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57866.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31364.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33378.03 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57810.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33447.23 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57866.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2659\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3981\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", + "Calling join\n", + "\u001b[94m[Round 0, Update 1] Average aggregated model validation loss = 0.39080703258514404\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 1] Average training loss = 0.2652348279953003\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 1] Average local model validation loss = 0.3942675143480301\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.38319262862205505\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.2195\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2213\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2243\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2270\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2291\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2310\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2341\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.2371\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2230\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2253\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2263\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2280\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2298\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2311\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2324\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.2346\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2331\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2338\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2351\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2361\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2378\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2385\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2398\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2408\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2375\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2379\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2386\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2398\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2404\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2413\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2421\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2428\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.2315\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.2315\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.392806738615036\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58286.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31766.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58230.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58286.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2315\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3928\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.3884606957435608\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2279\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2298\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2329\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2345\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2363\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2382\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2412\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.2428\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2262\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2280\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2295\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2305\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2317\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2359\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2375\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.2394\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2341\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.2347\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2354\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2367\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2378\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2386\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2392\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2402\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2352\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2356\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2361\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2369\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2376\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2381\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2390\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2397\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.2306\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.2306\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.4016312062740326\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58166.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31784.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58110.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58166.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.2306\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4016\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 1, Update 0] Average aggregated model validation loss = 0.3858266621828079\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 0] Average training loss = 0.23103156685829163\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 0] Average local model validation loss = 0.3972189724445343\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.38549479842185974\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.1714\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1729\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1755\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1777\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1793\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1809\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1836\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1854\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1804\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1824\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1834\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1847\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1863\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1876\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1888\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1907\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1972\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1979\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1991\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2000\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2014\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2021\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2032\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.2043\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2048\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2052\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2057\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2068\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2073\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2081\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2088\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.2095\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1995\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1995\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.401896208524704\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58226.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31748.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58170.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58226.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1995\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4019\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.3921719491481781\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1773\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1793\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1823\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1835\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1851\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1866\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1891\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1905\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1832\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1849\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1863\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1870\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1881\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1913\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1940\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1960\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.1966\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1973\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1984\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1993\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2001\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2007\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.2017\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1996\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2001\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2005\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2011\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2018\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2023\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2031\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.2037\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1968\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1968\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.413899302482605\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31770.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57830.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1968\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4139\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 1, Update 1] Average aggregated model validation loss = 0.3888333737850189\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 1] Average training loss = 0.19815459847450256\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 1] Average local model validation loss = 0.4078977555036545\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.39401885867118835\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.1237\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1246\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1261\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1270\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1279\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1287\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1301\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1316\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1355\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1373\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1381\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1395\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1409\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1420\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1430\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.1444\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1612\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1619\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1628\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1637\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1649\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1656\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1666\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1675\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1733\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1737\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1742\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1751\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1756\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1762\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1768\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1775\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1710\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1710\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.41375958919525146\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58186.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58130.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58186.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1710\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4138\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.40143540501594543\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1366\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1386\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1411\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1418\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1435\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1452\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1468\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1478\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1464\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1478\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1488\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1494\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1505\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1524\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1537\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1551\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1653\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.1658\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1664\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1673\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1681\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1689\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1695\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1704\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1743\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1751\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1757\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1764\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1768\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1775\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1780\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1720\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.42323189973831177\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58206.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31798.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58150.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58206.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4232\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 2, Update 0] Average aggregated model validation loss = 0.3977271318435669\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 0] Average training loss = 0.17152628302574158\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 0] Average local model validation loss = 0.4184957444667816\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.40234553813934326\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0984\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0989\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0996\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1004\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1011\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1018\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1027\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.1035\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1473\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1473\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.4498145878314972\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31762.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1473\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4498\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.41048040986061096\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1032\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1041\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1049\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1058\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1065\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1075\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1085\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.1092\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1116\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1130\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1140\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1148\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1160\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1172\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1186\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.1195\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1353\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.1358\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1363\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1370\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1377\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1384\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1390\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1397\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1460\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1463\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1467\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1472\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1478\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1482\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1489\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1493\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1441\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1441\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.44478899240493774\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58266.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58210.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58266.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1441\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4448\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 2, Update 1] Average aggregated model validation loss = 0.4064129739999771\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 1] Average training loss = 0.14573591947555542\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 1] Average local model validation loss = 0.44730179011821747\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.4368877112865448\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0855\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0863\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0871\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0882\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0889\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0895\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0902\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0911\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0842\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0862\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0868\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0874\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0882\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0887\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0894\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1078\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1084\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1092\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1099\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1108\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1113\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1121\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.1129\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1229\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1232\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1235\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1241\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1245\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1249\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1254\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.1259\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.4723173975944519\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58146.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31782.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58090.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58146.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4723\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.44539061188697815\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0859\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0867\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0880\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0891\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0899\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0907\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0936\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0941\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0948\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0954\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0963\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0970\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0981\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0987\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1174\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.1178\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1183\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1188\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1197\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1203\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.1218\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1301\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1304\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1308\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1312\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1317\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1321\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1326\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1330\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1277\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1277\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.4785761833190918\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58106.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31914.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58050.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58106.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1277\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4786\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 3, Update 0] Average aggregated model validation loss = 0.4411391615867615\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 0] Average training loss = 0.12434957921504974\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 0] Average local model validation loss = 0.47544679045677185\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.47256991267204285\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0721\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0727\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0735\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0744\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0752\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0759\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0768\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0777\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0733\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0750\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0756\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0761\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0766\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0771\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0780\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0786\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.1022\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.1022\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5270882248878479\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58346.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31804.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58290.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58346.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1022\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5271\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.48190006613731384\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0685\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0691\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0700\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0707\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0716\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0723\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0744\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0731\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0741\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0750\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0754\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0761\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0766\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0772\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0777\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0940\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.0945\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0949\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0954\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0960\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0966\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0972\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0978\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1058\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1061\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1064\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1068\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1073\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1076\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1082\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.1085\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.1042\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.1042\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.528550386428833\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58384.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58328.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58384.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.1042\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5286\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 3, Update 1] Average aggregated model validation loss = 0.47723498940467834\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 1] Average training loss = 0.10317578911781311\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 1] Average local model validation loss = 0.5278193056583405\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 30 steps, current loss: 0.0727\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0730\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0739\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0744\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0754\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] 
Completed 30 steps, current loss: 0.0759\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0824\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0826\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0828\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0831\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0832\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0835\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0838\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0841\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.0805\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.0805\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5606555938720703\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58806.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31916.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58748.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58806.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.0805\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5607\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5261728763580322\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0576\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0584\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0589\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0594\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0602\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0610\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0617\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0622\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0613\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0622\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0627\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0631\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0637\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0642\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0649\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0653\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0763\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.0766\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0770\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0773\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0778\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0783\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0788\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0791\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0871\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0873\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0875\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0878\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0882\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0885\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0891\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0893\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5728362202644348\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58444.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 32006.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58388.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58444.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5728\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 4, Update 0] Average aggregated model validation loss = 0.5190570056438446\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 0] Average training loss = 0.08311298489570618\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 0] Average local model validation loss = 0.5667459070682526\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Portland with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.26it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " 
warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5445127487182617\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Completed 10 steps, current loss: 0.0533\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0536\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0543\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0551\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0557\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0563\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0570\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 10 steps, current loss: 0.0577\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0569\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0578\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0583\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0587\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0590\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0595\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0600\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 20 steps, current loss: 0.0604\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0669\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0672\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0675\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0677\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0682\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0684\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0688\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 30 steps, current loss: 0.0694\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0750\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0752\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0755\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0757\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0759\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0763\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 40 steps, current loss: 0.0766\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Completed 50 steps, current loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Portland] Training completed, average loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5720435380935669\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57946.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31786.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57888.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33706.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 57946.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Seattle with 4bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.562188982963562\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Starting partial epoch training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:838: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0550\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0556\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0563\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0569\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0575\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0582\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0598\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 10 steps, current loss: 0.0602\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0583\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0590\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0594\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0597\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0605\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0610\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0614\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 20 steps, current loss: 0.0619\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0678\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 
steps, current loss: 0.0680\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0683\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0686\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0689\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0692\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0696\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 30 steps, current loss: 0.0699\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0732\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0736\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0738\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0741\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0743\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0746\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 40 steps, current loss: 0.0748\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Completed 50 steps, current loss: 0.0720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Seattle] Training completed, average loss: 0.0720\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:13]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.6029046773910522\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58264.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 31523.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 31886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58208.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 33606.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 58264.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: 0.0720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.6029\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling join\n", + "\u001b[94m[Round 4, Update 1] Average aggregated model validation loss = 0.5533508658409119\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 1] Average training loss = 0.07266537845134735\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 1] Average local model validation loss = 0.5874741077423096\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling end\n", + "\u001b[94mThis is the end of the flow for 4bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Final Metrics =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage Training Loss: 0.0727\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.5534\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Local Model Loss: 0.5875\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Metric History =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mTraining Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.3587\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.2652\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.2310\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.1982\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 0.1715\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.1457\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.1243\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.1032\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.0831\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.0727\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Aggregated Model Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.5808\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.3908\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.3858\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.3888\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 
0.3977\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.4064\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.4411\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.4772\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.5191\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.5534\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Local Model Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.3954\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.3943\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.3972\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.4079\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 0.4185\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.4473\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.4754\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.5278\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.5667\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.5875\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Memory Usage Summary Across All Rounds =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Portland Memory Usage Across Rounds/Updates (4bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 32696.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57374.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.3572\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3917\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 30342.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 41548.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 43451.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
before_training_allocated: 32388.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57268.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 32696.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57374.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 54745.21 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33547.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2645\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3904\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31364.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31586.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33378.03 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57670.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33547.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57726.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58286.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 
0.2315\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3928\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31766.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58230.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58286.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58226.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1995\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4019\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31748.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58170.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.76 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58226.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 
55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58186.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1710\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4138\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58130.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58186.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1473\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4498\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31762.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
before_training_reserved: 57930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58146.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4723\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31782.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58090.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58146.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58346.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1022\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 
0.5271\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31804.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58290.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58346.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58806.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.0805\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5607\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31916.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58748.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58806.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
round_4_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33706.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57946.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.0734\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31786.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57888.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33706.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57946.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Seattle Memory Usage Across Rounds/Updates (4bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 21814.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 47222.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.3601\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3991\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 19625.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 30754.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 
54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 21639.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 47114.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 21814.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 47222.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 54883.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33447.23 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57866.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2659\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3981\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31364.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33378.03 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57810.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33447.23 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57866.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55734.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58166.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.26 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.2306\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4016\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31784.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58110.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58166.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.26 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 57886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1968\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4139\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.66 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31770.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 57830.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 57886.00 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.27 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58206.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4232\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31798.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58150.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58206.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58266.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1441\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4448\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
before_training_allocated: 33537.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58210.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58266.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58106.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.1277\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4786\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31914.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58050.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.79 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58106.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.29 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58384.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 
0.1042\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5286\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.61 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58328.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.80 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58384.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58444.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.0857\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5728\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 32006.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58388.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58444.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 
55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 33606.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 58264.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: 0.0720\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.6029\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 31523.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 31886.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 33537.62 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 58208.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 33606.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 58264.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# Setup participants\n", + "aggregator = Aggregator()\n", + "collaborators = [\n", + " Collaborator(name=\"Portland\"),\n", + " Collaborator(name=\"Seattle\")\n", + "]\n", + "\n", + "# Assign data shards\n", + "for idx, colab in enumerate(collaborators):\n", + " colab.private_attributes = {\n", + " \"train_dataset\": train_dataset.shard(len(collaborators), idx),\n", + " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx)\n", + " }\n", + "\n", + "# Run with 4-bit quantization\n", + "print(\"\\n=============== Running with 4-bit Quantization ===============\\n\")\n", + "bnb_config = bnb_config_4bit # Set active config to 4-bit\n", 
+ "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", + "flflow_4bit = FederatedFlow(model, rounds=5, quant_type=\"4bit\") # Run 5 federated rounds\n", + "flflow_4bit.runtime = runtime\n", + "flflow_4bit.run()" + ] + }, + { + "cell_type": "markdown", + "id": "87c4865a", + "metadata": {}, + "source": [ + "## Run Federated Learning with 8-bit Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "93c60404", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleared CUDA cache between runs\n", + "\n", + "=============== Running with 8-bit Quantization ===============\n", + "\n", + "Loading model with 8-bit quantization on CPU first...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00, 2.05it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling start\n", + "\u001b[94mPerforming initialization for model with 8bit quantization\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mUsing 5 main rounds with partial round updates\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.27it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.561655580997467\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 
during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + "
StepTraining Loss
100.297100
200.287600
300.288400
400.284300
500.282800
600.281400
700.292800
800.296100
900.276100
1000.277400
1100.283700
1200.288200
1300.296800
1400.301800
1500.285500
1600.308200
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.728600
40.559300
60.475600
80.319500
100.310800
120.300300
140.244800
160.375300
180.305800
200.279400
220.359400
240.257100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.3953164219856262\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39210.98 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59636.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 36814.91 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38196.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39210.98 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59636.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39008.34 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3953\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.26it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5638197064399719\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 
during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:30, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.845500
40.502000
60.451700
80.422500
100.285700
120.259400
140.324700
160.300600
180.309400
200.332400
220.378600
240.347400

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You 
passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.40347251296043396\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39317.46 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 36974.44 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38276.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39317.46 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 59596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39149.93 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42176.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4035\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 0, Update 0] Average aggregated model validation loss = 0.5627376437187195\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 0] Average local model validation loss = 0.3993944674730301\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 0, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.395594984292984\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [22/25 01:14 < 00:11, 0.27 it/s, Epoch 0.04/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.287200
40.248800
60.295900
80.238400
100.245100
120.248000
140.212000
160.306900
180.248200
200.230800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.39317232370376587\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40105.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37772.41 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38798.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57164.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40105.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39841.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42898.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.3932\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 0, Update 1] Average aggregated model validation loss = 0.3971341550350189\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 0, Update 1] Average local model validation loss = 0.39462728798389435\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.3897101879119873\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.237800
40.217600
60.244500
80.208000
100.210300
120.213500
140.190500
160.264600
180.209000
200.189600
220.251900
240.158800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.40585511922836304\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61118.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57214.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61118.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39916.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42894.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4059\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.3935319185256958\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.268500
40.193500
60.218900
80.268700
100.190400
120.189700
140.214800
160.211600
180.231400
200.240300
220.246100
240.177200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.40515169501304626\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40215.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38916.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40215.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39949.17 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4052\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 1, Update 0] Average aggregated model validation loss = 0.39162105321884155\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 0] Average local model validation loss = 0.40550340712070465\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.3948669135570526\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.184200
40.176300
60.187500
80.161600
100.176600
120.178600
140.169600
160.213200
180.169800
200.152300
220.184800
240.121900

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.4210392236709595\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60316.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38794.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60316.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39911.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42890.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 1, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.39981263875961304\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:30, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.211300
40.154100
60.163600
80.212400
100.163400
120.166300
140.176000
160.181400
180.182900
200.197100
220.201300
240.147700

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.4217308461666107\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40218.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61308.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40218.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61308.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39946.36 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42924.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4217\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 1, Update 1] Average aggregated model validation loss = 0.3973397761583328\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 1, Update 1] Average local model validation loss = 0.4213850349187851\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.40616321563720703\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.128500
40.116900
60.109700
80.119700
100.154000
120.148100
140.156600
160.162500
180.128900
200.127000
220.123400
240.083500

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5076923370361328\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40203.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38848.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40203.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39909.20 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42830.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5077\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.41194236278533936\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.150600
40.110600
60.114500
80.168300
100.139400
120.133200
140.144500
160.158400
180.144300
200.149400
220.145400
240.108200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.4701308310031891\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.40 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61008.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38938.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40210.40 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61008.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39941.88 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 43064.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.4701\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 2, Update 0] Average aggregated model validation loss = 0.4090527892112732\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 0] Average local model validation loss = 0.48891158401966095\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.4745386242866516\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.077900
40.073000
60.103700
80.090100
100.109100
120.110200
140.128100
160.140200
180.123900
200.105000
220.098300
240.070000

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5099506974220276\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40192.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61154.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38856.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40192.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61154.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39908.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42890.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5100\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 2, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.48276180028915405\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.086700
40.066700
60.083300
80.140100
100.135800
120.124500
140.128400
160.135900
180.132500
200.125100
220.109400
240.079100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5275096893310547\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40199.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38960.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40199.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39946.38 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42922.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5275\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 2, Update 1] Average aggregated model validation loss = 0.47865021228790283\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 2, Update 1] Average local model validation loss = 0.5187301933765411\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5035024285316467\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.049000
40.054900
60.076800
80.087800
100.106500
120.091100
140.111600
160.129900
180.105400
200.090700
220.099800
240.083200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5614020824432373\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40187.96 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40187.96 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39911.37 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42870.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5614\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5139071941375732\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:30, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.055200
40.049700
60.085100
80.116400
100.106000
120.111700
140.102500
160.114700
180.106000
200.107500
220.085900
240.080100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Local evaluation loss: 0.5329421162605286\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40194.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61068.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38914.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40194.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61068.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39942.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42944.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5329\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + "\n", 
+ "Calling join\n", + "\u001b[94m[Round 3, Update 0] Average aggregated model validation loss = 0.50870481133461\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 0] Average local model validation loss = 0.5471720993518829\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + 
"\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5240783095359802\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
20.041300
40.044000
60.060200
80.074900
100.080700
120.078300
140.096000
160.111400
180.093000
200.079100
220.084700
240.066400

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", 
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [13/13 00:12]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Local evaluation loss: 0.5784252882003784\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40185.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Memory Usage by Stage:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38820.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40185.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39910.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42950.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Performance Metrics:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5784\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", + "\u001b[0mShould transfer from local_model_validation to join\n", + 
"\n", + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 3, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [13/13 00:11]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5348654389381409\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
StepTraining Loss
1700.29210020.047700
1800.30560040.042700
1900.29630060.064300
2000.30690080.092000
2100.321900100.089600
2200.309400120.105100
140.101900
160.093900
180.090200
200.093600
220.081300
240.066200

" @@ -2035,12 +10152,35 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " 
warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Calling local_model_validation\n" + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" ] }, { @@ -2057,6 +10197,27 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m" ] }, @@ -2067,7 +10228,7 @@ "

\n", " \n", " \n", - " [13/13 00:13]\n", + " [13/13 00:11]\n", "
\n", " " ], @@ -2082,141 +10243,68 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[94m[Seattle] Local evaluation loss: 0.5685837864875793\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", - "==== Memory Usage Report for Seattle (4bit) ====\u001b[0m\u001b[94m\n", + "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 59804.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40193.50 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60808.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Memory Usage by Stage:\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 33255.83 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 33424.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38936.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35322.87 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 59804.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40193.50 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60808.00 MB\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 35535.52 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 41528.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 57918.78 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39946.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42922.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Performance Metrics:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Training Loss: 0.2942\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Evaluation Loss: 0.3126\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5686\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0mShould transfer from local_model_validation to join\n", "\n", "Calling join\n", - "\u001b[94mAverage aggregated model validation values = 0.3085825592279434\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage training loss = 0.29453011579388616\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage local model validation values = 0.30557093024253845\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 3, Update 1] Average aggregated model validation loss = 0.5294718742370605\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 3, Update 1] Average local model validation loss = 0.5735045373439789\u001b[0m\u001b[94m\n", "\u001b[0m\n", - "Calling end\n", - "\u001b[94mThis is the end of the flow for 4bit quantization\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m\n", - "===== Final Metrics =====\n", - "\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage Training Loss: 0.2945\u001b[0m\u001b[94m\n", - 
"\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.3086\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mFinal Local Model Loss: 0.3056\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m\n", - "===== Memory Usage Summary Across All Rounds =====\n", - "\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m\n", - "==== Portland Memory Usage Across Rounds (4bit) ====\n", - "\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_allocated: 35323.02 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_reserved: 60420.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m training_loss: 0.3295\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m eval_loss: 0.3029\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_allocated: 32979.33 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_reserved: 45302.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_max_allocated: 48011.95 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_allocated: 35078.37 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_reserved: 60420.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_allocated: 35323.02 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_reserved: 41244.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_max_allocated: 57461.62 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_allocated: 35535.52 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_reserved: 59824.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m training_loss: 0.2949\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m eval_loss: 0.2986\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m 
model_load_allocated: 33255.83 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_reserved: 33440.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_allocated: 35322.87 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_reserved: 59824.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_allocated: 35535.52 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_reserved: 41948.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m\n", - "==== Seattle Memory Usage Across Rounds (4bit) ====\n", - "\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_allocated: 23170.02 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_reserved: 48256.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m training_loss: 0.3287\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m eval_loss: 0.3178\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_allocated: 20890.32 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_reserved: 33280.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_allocated: 22957.36 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_reserved: 48256.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_allocated: 23170.02 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_reserved: 29538.00 
MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_max_allocated: 57706.28 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_allocated: 35535.52 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_reserved: 59804.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m training_loss: 0.2942\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m eval_loss: 0.3126\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_allocated: 33255.83 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_reserved: 33424.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_allocated: 35322.87 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_reserved: 59804.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_allocated: 35535.52 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_reserved: 41528.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_max_allocated: 57918.78 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", - "\u001b[0mCleared CUDA cache between runs\n", - "\n", - "=============== Running with 8-bit Quantization ===============\n", - "\n" + "Calling aggregated_model_validation\n", + "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[0m" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|██████████| 6/6 [00:04<00:00, 1.36it/s]\n" + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\n", + "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", + " warnings.warn(message, FutureWarning)\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Calling start\n", - "\u001b[94mPerforming initialization for model with 8bit quantization\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, @@ -2224,16 +10312,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.30it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", - 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", - "\n", - "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", - " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", @@ -2248,8 +10327,8 @@ "\n", "
\n", " \n", - " \n", - " [12/12 00:11]\n", + " \n", + " [13/13 00:11]\n", "
\n", " " ], @@ -2260,30 +10339,37 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mPortland value of 0.5662918090820312\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling train\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 
during quantization\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5471141934394836\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", @@ -2296,8 +10382,8 @@ "\n", "
\n", " \n", - " \n", - " [229/229 13:26, Epoch 1/1]\n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", "
\n", " \n", " \n", @@ -2308,92 +10394,52 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
100.494400
200.367000
300.331100
400.342600
500.32900020.033400
600.32660040.040500
700.31560060.064200
800.32040080.065800
900.316400
1000.323500
1100.297700
1200.309000
1300.308100
1400.315500
1500.303500100.077300
1600.307800120.071600
1700.320400140.080400
1800.304000160.088800
1900.314900180.082500
2000.305600200.062700
2100.303100220.066100
2200.306500240.056600

" @@ -2417,6 +10463,14 @@ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m" ] }, @@ -2424,7 +10478,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\n", "Calling local_model_validation\n" 
] }, @@ -2442,6 +10497,9 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", @@ -2455,8 +10513,8 @@ "\n", "

\n", " \n", - " \n", - " [12/12 00:11]\n", + " \n", + " [13/13 00:12]\n", "
\n", " " ], @@ -2471,36 +10529,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[94m[Portland] Local evaluation loss: 0.5927156209945679\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 75278.13 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 93310.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40181.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60874.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Memory Usage by Stage:\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 72811.06 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 91120.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 91914.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 75278.13 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 93310.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40181.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60874.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", 
"\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 75250.57 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 83158.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92214.55 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39912.48 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42850.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Performance Metrics:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Training Loss: 0.3243\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Evaluation Loss: 0.2989\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5927\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0mShould transfer from local_model_validation to join\n", "\n", "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 4, Update 0] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, @@ -2508,7 +10567,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.24it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", @@ -2519,6 +10578,23 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", @@ -2544,30 +10620,37 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mSeattle value of 0.5757399201393127\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling train\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5571405291557312\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", 
"\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", @@ -2580,8 +10663,8 @@ "\n", "
\n", " \n", - " \n", - " [229/229 13:29, Epoch 0/1]\n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", "
\n", " \n", " \n", @@ -2592,92 +10675,52 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
100.481300
200.365600
300.344500
400.331200
500.320000
600.314700
700.326100
800.331600
900.306900
1000.30480020.035800
1100.30920040.037800
1200.30830060.059100
1300.31350080.089500
1400.318300
1500.299200100.071900
1600.318800120.084300
1700.300500140.088000
1800.310100160.095700
1900.299400180.074300
2000.310000200.082400
2100.323100220.071100
2200.310700240.058900

" @@ -2701,6 +10744,14 @@ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m" ] }, @@ -2708,8 +10759,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Calling local_model_validation\n" + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" ] 
}, { @@ -2726,6 +10777,23 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", @@ -2755,41 +10823,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[94m[Seattle] Local evaluation loss: 0.5642604827880859\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m 
Allocated: 58276.76 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 83620.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40189.73 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Memory Usage by Stage:\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 55775.01 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 74152.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 58145.08 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 83620.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40189.73 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 61506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 58276.76 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 65726.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39944.91 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 43022.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Performance Metrics:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m 
Training Loss: 0.3242\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Evaluation Loss: 0.3134\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5643\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0mShould transfer from local_model_validation to join\n", "\n", "Calling join\n", - "\u001b[94mAverage aggregated model validation values = 0.571015864610672\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage training loss = 0.3242545044578319\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage local model validation values = 0.30610978603363037\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 4, Update 0] Average aggregated model validation loss = 0.5521273612976074\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 0] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 0] Average local model validation loss = 0.5784880518913269\u001b[0m\u001b[94m\n", "\u001b[0m\n", "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Portland with 8bit\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, @@ -2797,7 +10866,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.25it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", @@ -2808,6 +10877,23 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Portland] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", @@ -2821,8 +10907,8 @@ "\n", "

\n", " \n", - " \n", - " [12/12 00:11]\n", + " \n", + " [13/13 00:11]\n", "
\n", " " ], @@ -2833,30 +10919,37 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mPortland value of 0.296934574842453\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling train\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during 
quantization\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mPortland evaluation loss: 0.5503019094467163\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Portland] Restoring optimizer state\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", @@ -2869,8 +10962,8 @@ "\n", "
\n", " \n", - " \n", - " [229/229 13:24, Epoch 1/1]\n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", "
\n", " \n", " \n", @@ -2881,92 +10974,52 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
100.295100
200.290700
300.277600
400.291800
500.286200
600.288300
700.27760020.033700
800.28600040.033800
900.28520060.052200
1000.29540080.071900
1100.272500
1200.288300
1300.288100
1400.298300
1500.287800100.063400
1600.295400120.057900
1700.309200140.065800
1800.293700160.068000
1900.308100180.085900
2000.299100200.057100
2100.296900220.056000
2200.301700240.051500

" @@ -2990,6 +11043,14 @@ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m" ] }, @@ -2997,8 +11058,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Calling local_model_validation\n" + "\u001b[94m[Portland] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" ] 
}, { @@ -3015,6 +11076,23 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", @@ -3028,8 +11106,8 @@ "\n", "

\n", " \n", - " \n", - " [12/12 00:11]\n", + " \n", + " [13/13 00:12]\n", "
\n", " " ], @@ -3044,36 +11122,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", + "\u001b[94m[Portland] Local evaluation loss: 0.5799501538276672\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Portland\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "==== Memory Usage Report for Portland (8bit) ====\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 75488.79 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 93552.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40182.25 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Memory Usage by Stage:\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 73023.56 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 74272.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92551.30 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38820.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 75488.79 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 93552.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40182.25 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", 
"\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 75466.38 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 82944.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39911.18 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 42930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Performance Metrics:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Training Loss: 0.2914\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Evaluation Loss: 0.2939\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5800\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0mShould transfer from local_model_validation to join\n", "\n", "Calling aggregated_model_validation\n", - "\u001b[94mPerforming aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 4, Update 1] Performing aggregated model validation for collaborator Seattle with 8bit\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, @@ -3081,7 +11160,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.28it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", + "Loading checkpoint shards: 100%|##########| 6/6 [00:04<00:00, 1.23it/s]\u001b[0m\u001b[94m\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", @@ -3092,6 +11171,23 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94m[Seattle] Training with 50 steps\u001b[0m\u001b[94m\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. 
The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", @@ -3117,140 +11213,107 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mSeattle value of 0.31116044521331787\u001b[0m\u001b[94m\n", - "\u001b[0m\n", - "Calling train\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field, packing. Will not be supported from version '0.13.0'.\n", + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", - " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", - " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", - "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", - " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[94mSeattle evaluation loss: 0.5621325373649597\u001b[0m\u001b[94m\n", + "\u001b[0m\n", + "Calling train\n", + "\u001b[94m[Seattle] Restoring optimizer 
state\u001b[0m\u001b[94m\n", "\u001b[0m" ] }, { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [229/229 13:28, Epoch 0/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [25/25 01:29, Epoch 0/1]\n", + "
\n", + "
StepTraining Loss
100.294400
200.284700
300.289000
400.282800
500.279500
600.277900
700.289600
800.294600
900.275700
1000.276200
\n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
StepTraining Loss
1100.28060020.038600
1200.28470040.032600
1300.29260060.056700
1400.29780080.066900
1500.281700100.062500
1600.305000120.060700
1700.288500140.078300
1800.301900160.075200
1900.292400180.063500
2000.303000200.071600
2100.317300220.064000
2200.305900240.045200

" @@ -3274,6 +11337,14 @@ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m" ] }, @@ -3281,8 +11352,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Calling local_model_validation\n" + "\u001b[94m[Seattle] Training completed, average loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m" ] 
}, { @@ -3299,6 +11370,23 @@ " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", + "\u001b[0mmax_steps is given, it will override any value given in num_train_epochs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Calling local_model_validation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:428: UserWarning: You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n", + " warnings.warn(\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", "\u001b[0m\u001b[94m/home/azureuser/env_name/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n", @@ -3328,195 +11416,526 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", + "\u001b[94m[Seattle] Local evaluation loss: 0.5817354321479797\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mDoing local model validation for collaborator Seattle\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "==== Memory Usage Report for Seattle (8bit) ====\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94mPeak Memory Usage:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m 
Allocated: 75529.57 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 93476.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40191.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60688.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Memory Usage by Stage:\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m model_load:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 73023.56 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 74242.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 38938.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m before_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 75388.56 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 93476.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 40191.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 60688.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m after_training:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Allocated: 75529.57 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Reserved: 83044.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Max Allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Allocated: 39944.87 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Reserved: 43002.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Max Allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "Performance Metrics:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m 
Training Loss: 0.2912\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m Evaluation Loss: 0.3080\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Evaluation Loss: 0.5817\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0mShould transfer from local_model_validation to join\n", "\n", "Calling join\n", - "\u001b[94mAverage aggregated model validation values = 0.30404751002788544\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage training loss = 0.29131152119699005\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage local model validation values = 0.3009362369775772\u001b[0m\u001b[94m\n", + "\u001b[94m[Round 4, Update 1] Average aggregated model validation loss = 0.556217223405838\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 1] Average training loss = inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m[Round 4, Update 1] Average local model validation loss = 0.5808427929878235\u001b[0m\u001b[94m\n", "\u001b[0m\n", "Calling end\n", "\u001b[94mThis is the end of the flow for 8bit quantization\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "===== Final Metrics =====\n", "\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mAverage Training Loss: 0.2913\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.3040\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94mFinal Local Model Loss: 0.3009\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mAverage Training Loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Aggregated Model Loss: 0.5562\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mFinal Local Model Loss: 0.5808\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "===== Metric History =====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94mTraining Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: inf\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m Update 2: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Aggregated Model Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.5627\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.3971\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.3916\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.3973\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 0.4091\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.4787\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.5087\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.5295\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.5521\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.5562\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m\n", + "Local Model Loss History:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 0: 0.3994\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 1: 0.3946\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 2: 0.4055\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 3: 0.4214\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 4: 0.4889\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 5: 0.5187\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 6: 0.5472\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 7: 0.5735\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 8: 0.5785\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m Update 9: 0.5808\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", "===== Memory Usage Summary Across All Rounds =====\n", 
"\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", - "==== Portland Memory Usage Across Rounds (8bit) ====\n", - "\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_allocated: 75278.13 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_reserved: 93310.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_max_allocated: 92214.55 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m training_loss: 0.3243\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m eval_loss: 0.2989\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_allocated: 72811.06 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_reserved: 91120.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_max_allocated: 91914.16 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_allocated: 75278.13 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_reserved: 93310.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_max_allocated: 92214.55 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_allocated: 75250.57 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_reserved: 83158.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_max_allocated: 92214.55 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_allocated: 75488.79 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_reserved: 93552.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_max_allocated: 92552.04 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m training_loss: 0.2914\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m eval_loss: 0.2939\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_allocated: 73023.56 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_reserved: 74272.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m 
before_training_allocated: 75488.79 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_reserved: 93552.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_max_allocated: 92552.04 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_allocated: 75466.38 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_reserved: 82944.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_max_allocated: 92552.04 MB\u001b[0m\u001b[94m\n", + "==== Portland Memory Usage Across Rounds/Updates (8bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 39210.98 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 59636.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3953\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 36814.91 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38196.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 55894.31 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 39210.98 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 59636.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39008.34 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 56253.51 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40100.06 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60872.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m 
peak_max_allocated: 57164.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3961\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37719.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38654.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 56518.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40100.06 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60872.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57164.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39808.58 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42766.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57164.02 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40210.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61118.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4059\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57214.60 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40210.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61118.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39916.32 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42894.00 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40210.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60316.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4210\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38794.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40210.16 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60316.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39911.82 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42890.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40203.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5077\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38848.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 
40203.53 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61136.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39909.20 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42830.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40192.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61154.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5100\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38856.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40192.70 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61154.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39908.28 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42890.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40187.96 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m eval_loss: 0.5614\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38822.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40187.96 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39911.37 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42870.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40185.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5784\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38820.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40185.57 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39910.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42950.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m round_4_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40181.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60874.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5927\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38838.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40181.67 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60874.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39912.48 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42850.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40182.25 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60476.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5800\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37825.69 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38820.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40182.25 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60476.00 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39911.18 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m\n", - "==== Seattle Memory Usage Across Rounds (8bit) ====\n", - "\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m round_0:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_allocated: 58276.76 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_reserved: 83620.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m training_loss: 0.3242\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m eval_loss: 0.3134\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_allocated: 55775.01 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_reserved: 74152.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_allocated: 58145.08 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_reserved: 83620.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_allocated: 58276.76 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_reserved: 65726.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_max_allocated: 92551.30 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m round_1:\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_allocated: 75529.57 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_reserved: 93476.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m peak_max_allocated: 92767.32 
MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m training_loss: 0.2912\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m eval_loss: 0.3080\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_allocated: 73023.56 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_reserved: 74242.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m model_load_max_allocated: 92767.32 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_allocated: 75388.56 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_reserved: 93476.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m before_training_max_allocated: 92767.32 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_allocated: 75529.57 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_reserved: 83044.00 MB\u001b[0m\u001b[94m\n", - "\u001b[0m\u001b[94m after_training_max_allocated: 92767.32 MB\u001b[0m\u001b[94m\n", + "==== Seattle Memory Usage Across Rounds/Updates (8bit) ====\n", + "\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 39317.46 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 59596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4035\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 36974.44 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38276.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 39317.46 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 59596.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39149.93 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42176.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 56364.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_0_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40105.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.3932\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37772.41 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38798.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57164.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40105.33 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39841.12 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42898.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57210.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40215.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4052\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38916.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 
57272.77 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40215.71 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60930.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39949.17 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42986.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57319.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_1_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40218.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61308.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4217\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38902.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40218.59 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61308.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39946.36 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42924.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40210.40 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61008.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.4701\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38938.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40210.40 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61008.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39941.88 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 43064.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_2_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40199.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5275\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38960.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40199.68 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39946.38 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42922.00 MB\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40194.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61068.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5329\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38914.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40194.72 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61068.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39942.97 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42944.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_3_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40193.50 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60808.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5686\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38936.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40193.50 
MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60808.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39946.86 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 42922.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_0:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40189.73 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 61506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m eval_loss: 0.5643\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38928.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40189.73 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 61506.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39944.91 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 43022.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m round_4_update_1:\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_allocated: 40191.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_reserved: 60688.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m peak_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m training_loss: inf\u001b[0m\u001b[94m\n", + 
"\u001b[0m\u001b[94m eval_loss: 0.5817\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_allocated: 37878.81 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_reserved: 38938.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m model_load_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_allocated: 40191.22 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_reserved: 60688.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m before_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_allocated: 39944.87 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_reserved: 43002.00 MB\u001b[0m\u001b[94m\n", + "\u001b[0m\u001b[94m after_training_max_allocated: 57321.08 MB\u001b[0m\u001b[94m\n", "\u001b[0m\u001b[94m--------------------------------------------------\u001b[0m\u001b[94m\n", "\u001b[0m" ] } ], "source": [ - "# Setup participants\n", - "aggregator = Aggregator()\n", - "collaborators = [\n", - " Collaborator(name=\"Portland\"),\n", - " Collaborator(name=\"Seattle\")\n", - "]\n", - "\n", - "# Assign data shards\n", - "for idx, colab in enumerate(collaborators):\n", - " colab.private_attributes = {\n", - " \"train_dataset\": train_dataset.shard(len(collaborators), idx),\n", - " \"eval_dataset\": eval_dataset.shard(len(collaborators), idx)\n", - " }\n", - "\n", - "# First run with 4-bit quantization\n", - "print(\"\\n=============== Running with 4-bit Quantization ===============\\n\")\n", - "bnb_config = bnb_config_4bit # Set active config to 4-bit\n", - "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", - "flflow_4bit = FederatedFlow(model, rounds=2, quant_type=\"4bit\")\n", - "flflow_4bit.runtime = runtime\n", - "flflow_4bit.run()\n", - "\n", "# Clean up CUDA cache between runs\n", "if torch.cuda.is_available():\n", " torch.cuda.empty_cache()\n", " print(\"Cleared CUDA cache between runs\")\n", "\n", - "# Then run 
with 8-bit quantization\n", + "import gc\n", + "import time\n", + "\n", + "# Force garbage collection\n", + "gc.collect()\n", + "time.sleep(5) # Give system time to free memory\n", + "\n", + "# Run with 8-bit quantization\n", "print(\"\\n=============== Running with 8-bit Quantization ===============\\n\")\n", "bnb_config = bnb_config_8bit # Set active config to 8-bit\n", - "# Reload the model with 8-bit quantization\n", + "\n", + "# Force model to be loaded on CPU first for 8-bit quantization\n", + "print(\"Loading model with 8-bit quantization on CPU first...\")\n", "model_8bit = AutoModelForCausalLM.from_pretrained(\n", " model_name,\n", + " device_map=\"cpu\", # Start on CPU to avoid OOM\n", " quantization_config=bnb_config_8bit,\n", - " device_map=\"auto\",\n", - " trust_remote_code=True\n", + " trust_remote_code=True,\n", + " torch_dtype=torch.float32, # Use float32 for CPU\n", + " low_cpu_mem_usage=True\n", ")\n", "model_8bit = prepare_model_for_kbit_training(model_8bit)\n", "model_8bit = get_peft_model(model_8bit, peft_config)\n", "\n", + "# Use only one round and one collaborator for 8-bit to save memory\n", "runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", - "flflow_8bit = FederatedFlow(model_8bit, rounds=2, quant_type=\"8bit\")\n", + "flflow_8bit = FederatedFlow(model_8bit, rounds=5, quant_type=\"8bit\")\n", "flflow_8bit.runtime = runtime\n", "flflow_8bit.run()" ] }, { "cell_type": "markdown", - "id": "6e1268ae", + "id": "ebe541a4", + "metadata": {}, + "source": [ + "## Visualize Memory and Performance Metrics" + ] + }, + { + "cell_type": "markdown", + "id": "4718aa9f", "metadata": {}, "source": [ - "## Compare VRAM Usage and Training Loss Between 4-bit and 8-bit" + "Now that we've run our federated training with both 4-bit and 8-bit quantization, let's visualize the memory usage and performance metrics to understand the tradeoffs between these approaches." 
] }, { "cell_type": "code", - "execution_count": 12, - "id": "6f5caae6", + "execution_count": 15, + "id": "d84d3daa-7520-4b3f-a1d6-ae7cebec58e7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "==== Memory Usage Comparison: 4-bit vs 8-bit ====\n", - "\n" + "Converting any CUDA tensors to CPU for visualization...\n", + "Conversion complete. Ready for visualization.\n" ] - }, + } + ], + "source": [ + "# Pre-process tensors to fix CUDA to CPU conversion issues\n", + "def tensor_to_float(val):\n", + " if val is None:\n", + " return None\n", + " if isinstance(val, torch.Tensor):\n", + " return val.detach().cpu().float().numpy().item()\n", + " return val\n", + "\n", + "# Convert all tensors in both flow objects\n", + "print(\"Converting any CUDA tensors to CPU for visualization...\")\n", + "\n", + "# Convert the history lists\n", + "flflow_4bit.average_loss_history = [tensor_to_float(x) for x in flflow_4bit.average_loss_history]\n", + "flflow_4bit.agg_model_loss_history = [tensor_to_float(x) for x in flflow_4bit.agg_model_loss_history]\n", + "flflow_4bit.local_model_loss_history = [tensor_to_float(x) for x in flflow_4bit.local_model_loss_history]\n", + "\n", + "flflow_8bit.average_loss_history = [tensor_to_float(x) for x in flflow_8bit.average_loss_history]\n", + "flflow_8bit.agg_model_loss_history = [tensor_to_float(x) for x in flflow_8bit.agg_model_loss_history]\n", + "flflow_8bit.local_model_loss_history = [tensor_to_float(x) for x in flflow_8bit.local_model_loss_history]\n", + "\n", + "# Convert current values\n", + "flflow_4bit.average_loss = tensor_to_float(flflow_4bit.average_loss)\n", + "flflow_4bit.aggregated_model_accuracy = tensor_to_float(flflow_4bit.aggregated_model_accuracy)\n", + "flflow_4bit.local_model_accuracy = tensor_to_float(flflow_4bit.local_model_accuracy)\n", + "\n", + "flflow_8bit.average_loss = tensor_to_float(flflow_8bit.average_loss)\n", + "flflow_8bit.aggregated_model_accuracy = 
tensor_to_float(flflow_8bit.aggregated_model_accuracy)\n", + "flflow_8bit.local_model_accuracy = tensor_to_float(flflow_8bit.local_model_accuracy)\n", + "\n", + "# Convert tensors in memory stats\n", + "for collab, rounds_data in flflow_4bit.all_memory_stats.items():\n", + " for round_name, stats in rounds_data.items():\n", + " if \"training_loss\" in stats and isinstance(stats[\"training_loss\"], torch.Tensor):\n", + " stats[\"training_loss\"] = tensor_to_float(stats[\"training_loss\"])\n", + " if \"eval_loss\" in stats and isinstance(stats[\"eval_loss\"], torch.Tensor):\n", + " stats[\"eval_loss\"] = tensor_to_float(stats[\"eval_loss\"])\n", + "\n", + "for collab, rounds_data in flflow_8bit.all_memory_stats.items():\n", + " for round_name, stats in rounds_data.items():\n", + " if \"training_loss\" in stats and isinstance(stats[\"training_loss\"], torch.Tensor):\n", + " stats[\"training_loss\"] = tensor_to_float(stats[\"training_loss\"])\n", + " if \"eval_loss\" in stats and isinstance(stats[\"eval_loss\"], torch.Tensor):\n", + " stats[\"eval_loss\"] = tensor_to_float(stats[\"eval_loss\"])\n", + "\n", + "print(\"Conversion complete. Ready for visualization.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e6c8db6d", + "metadata": {}, + "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "

" ] @@ -3532,614 +11951,125 @@ "==== Performance Summary ====\n", "\n", "Memory Usage Comparison:\n", - " 4-bit Avg: 57698.24 MB\n", - " 8-bit Avg: 92521.30 MB\n", - " Difference: 60.4% more memory with 8-bit\n", + " 4-bit Avg: 55770.34 MB\n", + " 8-bit Avg: 57204.05 MB\n", + " Difference: 2.6% more memory with 8-bit\n", "\n", "Evaluation Loss Comparison:\n", - " 4-bit Avg: 0.3080\n", - " 8-bit Avg: 0.3035\n", - " Difference: 1.4% lower loss with 8-bit\n", + " 4-bit Avg: 0.4618\n", + " 8-bit Avg: 0.4909\n", + " Difference: 6.3% higher loss with 8-bit\n", "\n", - "Efficiency Analysis: 4-bit provides better memory efficiency with lower loss\n" + "Efficiency Analysis: 8-bit provides more efficiency memory usage relative to loss\n" ] } ], "source": [ - "# Plot memory metrics comparison between 4-bit and 8-bit\n", - "print(\"\\n==== Memory Usage Comparison: 4-bit vs 8-bit ====\\n\")\n", + "# Visualize memory usage across quantization methods\n", "plot_memory_metrics(flflow_4bit, flflow_8bit)" ] }, - { - "cell_type": "markdown", - "id": "19dfbe72", - "metadata": {}, - "source": [ - "## Analysis and Conclusions" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "5da1c53e", - "metadata": {}, - "outputs": [], - "source": [ - "# Utility functions to monitor GPU memory usage\n", - "def get_gpu_memory_info():\n", - " \"\"\"Get current GPU memory usage in MB\"\"\"\n", - " if torch.cuda.is_available():\n", - " # Get the current device\n", - " device = torch.cuda.current_device()\n", - " # Get memory information\n", - " total_memory = torch.cuda.get_device_properties(device).total_memory / 1024**2 # MB\n", - " allocated_memory = torch.cuda.memory_allocated(device) / 1024**2 # MB\n", - " reserved_memory = torch.cuda.memory_reserved(device) / 1024**2 # MB\n", - " free_memory = total_memory - allocated_memory\n", - " \n", - " return {\n", - " \"device\": device,\n", - " \"total_memory_mb\": total_memory,\n", - " \"allocated_memory_mb\": allocated_memory,\n", - " 
\"reserved_memory_mb\": reserved_memory,\n", - " \"free_memory_mb\": free_memory\n", - " }\n", - " else:\n", - " return {\"device\": \"cpu\", \"error\": \"CUDA not available\"}\n", - "\n", - "class MemoryTracker:\n", - " \"\"\"Track memory usage across training phases\"\"\"\n", - " def __init__(self, collaborator_name, quant_type=\"4bit\"):\n", - " self.collaborator_name = collaborator_name\n", - " self.quant_type = quant_type # Track whether this is 4bit or 8bit\n", - " self.memory_log = {\n", - " \"model_load\": None,\n", - " \"after_trainer_init\": None,\n", - " \"before_training\": None,\n", - " \"after_training\": None,\n", - " \"peak_memory\": None\n", - " }\n", - " self.peak_memory = 0\n", - " self.reset_peak()\n", - " \n", - " def update_peak(self):\n", - " \"\"\"Update peak memory usage\"\"\"\n", - " if torch.cuda.is_available():\n", - " self.peak_memory = torch.cuda.max_memory_allocated() / 1024**2 # MB\n", - " self.memory_log[\"peak_memory\"] = self.peak_memory\n", - " \n", - " def log_memory(self, phase):\n", - " \"\"\"Log memory usage at a specific phase\"\"\"\n", - " if phase in self.memory_log:\n", - " self.memory_log[phase] = get_gpu_memory_info()[\"allocated_memory_mb\"] if torch.cuda.is_available() else 0\n", - " self.update_peak()\n", - " \n", - " def reset_peak(self):\n", - " \"\"\"Reset peak memory stats\"\"\"\n", - " if torch.cuda.is_available():\n", - " torch.cuda.reset_peak_memory_stats()\n", - " \n", - " def report(self):\n", - " \"\"\"Print memory usage report\"\"\"\n", - " print(f\"\\n==== Memory Usage Report for {self.collaborator_name} ====\")\n", - " for phase, memory in self.memory_log.items():\n", - " if memory is not None:\n", - " print(f\"{phase}: {memory:.2f} MB\")\n", - " else:\n", - " print(f\"{phase}: Not measured\")\n", - " print(f\"Quantization type: {self.quant_type}\")\n", - " print(\"=\"*50)\n", - " \n", - " def get_stats(self):\n", - " \"\"\"Get all stats in a dictionary format for aggregation\"\"\"\n", - " stats = {k: v 
for k, v in self.memory_log.items()}\n", - " stats[\"training_loss\"] = self.training_loss\n", - " stats[\"eval_loss\"] = self.eval_loss\n", - " stats[\"quant_type\"] = self.quant_type\n", - " return stats" - ] - }, - { - "cell_type": "markdown", - "id": "78f47784", - "metadata": {}, - "source": [ - "## Memory and Loss Monitoring Utilities" - ] - }, - { - "cell_type": "markdown", - "id": "805feb3c", - "metadata": {}, - "source": [ - "## Visualization Functions" - ] - }, { "cell_type": "code", "execution_count": 17, - "id": "cf20ad92", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_loss_metrics(flow_4bit, flow_8bit):\n", - " \"\"\"Plot and compare loss metrics between 4-bit and 8-bit quantization.\"\"\"\n", - " try:\n", - " import matplotlib.pyplot as plt\n", - " import pandas as pd\n", - " \n", - " # Create figure with two subplots\n", - " fig, axs = plt.subplots(1, 2, figsize=(16, 6))\n", - " fig.suptitle('Training and Evaluation Loss: 4-bit vs 8-bit Quantization', fontsize=16)\n", - " \n", - " # Prepare data\n", - " rounds = list(range(1, len(flow_4bit.average_loss_history) + 1))\n", - " \n", - " # Plot training loss\n", - " axs[0].set_title('Training Loss by Round')\n", - " axs[0].plot(rounds, flow_4bit.average_loss_history, 'o-', label='4-bit', color='blue')\n", - " axs[0].plot(rounds, flow_8bit.average_loss_history, 's-', label='8-bit', color='red')\n", - " axs[0].set_xlabel('Round')\n", - " axs[0].set_ylabel('Average Training Loss')\n", - " axs[0].legend()\n", - " axs[0].grid(True, alpha=0.3)\n", - " \n", - " # Plot evaluation loss\n", - " axs[1].set_title('Evaluation Loss by Round')\n", - " axs[1].plot(rounds, flow_4bit.local_model_loss_history, 'o-', label='4-bit (Local)', color='blue')\n", - " axs[1].plot(rounds, flow_8bit.local_model_loss_history, 's-', label='8-bit (Local)', color='red')\n", - " axs[1].plot(rounds, flow_4bit.agg_model_loss_history, 'o--', label='4-bit (Agg)', color='lightblue')\n", - " axs[1].plot(rounds, 
flow_8bit.agg_model_loss_history, 's--', label='8-bit (Agg)', color='salmon')\n", - " axs[1].set_xlabel('Round')\n", - " axs[1].set_ylabel('Evaluation Loss')\n", - " axs[1].legend()\n", - " axs[1].grid(True, alpha=0.3)\n", - " \n", - " plt.tight_layout()\n", - " plt.subplots_adjust(top=0.88)\n", - " plt.show()\n", - " \n", - " # Print textual summary\n", - " print(\"\\nLoss Metrics Summary:\")\n", - " print(f\"Final Training Loss: 4-bit = {flow_4bit.average_loss_history[-1]:.4f}, 8-bit = {flow_8bit.average_loss_history[-1]:.4f}\")\n", - " print(f\"Training Loss Difference: {abs(flow_4bit.average_loss_history[-1] - flow_8bit.average_loss_history[-1]):.4f}\")\n", - " \n", - " print(f\"\\nFinal Local Eval Loss: 4-bit = {flow_4bit.local_model_loss_history[-1]:.4f}, 8-bit = {flow_8bit.local_model_loss_history[-1]:.4f}\")\n", - " print(f\"Local Eval Loss Difference: {abs(flow_4bit.local_model_loss_history[-1] - flow_8bit.local_model_loss_history[-1]):.4f}\")\n", - " \n", - " print(f\"\\nFinal Aggregated Eval Loss: 4-bit = {flow_4bit.agg_model_loss_history[-1]:.4f}, 8-bit = {flow_8bit.agg_model_loss_history[-1]:.4f}\")\n", - " print(f\"Aggregated Eval Loss Difference: {abs(flow_4bit.agg_model_loss_history[-1] - flow_8bit.agg_model_loss_history[-1]):.4f}\")\n", - " \n", - " better_training = \"4-bit\" if flow_4bit.average_loss_history[-1] < flow_8bit.average_loss_history[-1] else \"8-bit\"\n", - " better_local = \"4-bit\" if flow_4bit.local_model_loss_history[-1] < flow_8bit.local_model_loss_history[-1] else \"8-bit\"\n", - " better_agg = \"4-bit\" if flow_4bit.agg_model_loss_history[-1] < flow_8bit.agg_model_loss_history[-1] else \"8-bit\"\n", - " \n", - " print(f\"\\nBest Training Performance: {better_training}\")\n", - " print(f\"Best Local Evaluation Performance: {better_local}\")\n", - " print(f\"Best Aggregated Evaluation Performance: {better_agg}\")\n", - " \n", - " except ImportError:\n", - " print(\"Plotting requires matplotlib and pandas. 
Install with: pip install matplotlib pandas\")\n", - " except Exception as e:\n", - " print(f\"Error plotting metrics: {str(e)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "2668f39a-537e-4b4e-abfa-6b297e3aaa36", + "id": "63f3d4e1", "metadata": {}, "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "Visualization libraries installed and imported successfully.\n" + "\n", + "==== Loss Comparison: 4-bit vs 8-bit ====\n", + "\n", + "Training Loss (4-bit): 0.1754 ± 0.0877\n", + "Training Loss (8-bit): inf ± nan\n", + "\n", + "Eval Loss (4-bit): 0.4618 ± 0.0725\n", + "Eval Loss (8-bit): 0.4909 ± 0.0776\n" ] } ], "source": [ - "!pip install seaborn matplotlib pandas -q\n", - "\n", - "# Import the libraries\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "\n", - "print(\"Visualization libraries installed and imported successfully.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "35cb4b3a", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualization functions for memory usage and training loss\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "\n", - "def plot_memory_metrics(flow_4bit, flow_8bit=None):\n", - " \"\"\"Plot memory usage metrics comparing 4-bit and 8-bit quantization\"\"\"\n", - " # Extract and organize memory data for 4-bit\n", - " memory_data = []\n", - " \n", - " for collab, rounds_data in flow_4bit.all_memory_stats.items():\n", - " for round_name, stats in rounds_data.items():\n", - " # Extract round number from round_name (e.g., 'round_0' -> 0)\n", - " round_num = int(round_name.split('_')[1]) if '_' in round_name else 0\n", - " quant_type = stats.get(\"quant_type\", \"4bit\")\n", - " \n", - " # Extract memory data\n", - " for phase, memory in stats.items():\n", - " if memory is not None and phase not in ['training_loss', 'eval_loss', 'quant_type']:\n", - " memory_data.append({\n", - " \"Collaborator\": collab,\n", - " \"Round\": round_name,\n", - " \"Round Number\": round_num,\n", - " \"Phase\": phase,\n", - " \"Memory (MB)\": memory,\n", - " \"Quantization\": quant_type\n", - " })\n", - " \n", 
- " # Add 8-bit data if provided\n", - " if flow_8bit is not None:\n", - " for collab, rounds_data in flow_8bit.all_memory_stats.items():\n", - " for round_name, stats in rounds_data.items():\n", - " # Extract round number from round_name (e.g., 'round_0' -> 0)\n", - " round_num = int(round_name.split('_')[1]) if '_' in round_name else 0\n", - " quant_type = stats.get(\"quant_type\", \"8bit\")\n", - " \n", - " # Extract memory data\n", - " for phase, memory in stats.items():\n", - " if memory is not None and phase not in ['training_loss', 'eval_loss', 'quant_type']:\n", - " memory_data.append({\n", - " \"Collaborator\": collab,\n", - " \"Round\": round_name,\n", - " \"Round Number\": round_num,\n", - " \"Phase\": phase,\n", - " \"Memory (MB)\": memory,\n", - " \"Quantization\": quant_type\n", - " })\n", - " \n", - " if not memory_data:\n", - " print(\"No memory data collected\")\n", - " return\n", - " \n", - " memory_df = pd.DataFrame(memory_data)\n", - " \n", - " # Create a figure with subplots for memory metrics\n", - " fig, axes = plt.subplots(2, 1, figsize=(15, 14), gridspec_kw={'height_ratios': [1, 0.7]})\n", - " \n", - " # 1. Memory usage by phase for each quantization (top plot)\n", - " if flow_8bit is not None:\n", - " sns.barplot(x=\"Phase\", y=\"Memory (MB)\", hue=\"Quantization\", data=memory_df, ax=axes[0])\n", - " axes[0].set_title(\"Memory Usage by Phase and Quantization Type\", fontsize=14, fontweight='bold')\n", - " else:\n", - " sns.barplot(x=\"Phase\", y=\"Memory (MB)\", hue=\"Collaborator\", data=memory_df, ax=axes[0])\n", - " axes[0].set_title(\"Memory Usage by Phase and Collaborator\", fontsize=14, fontweight='bold')\n", - " \n", - " axes[0].set_xlabel(\"Phase\", fontsize=12)\n", - " axes[0].set_ylabel(\"Memory (MB)\", fontsize=12)\n", - " axes[0].tick_params(axis='x', rotation=45)\n", - " axes[0].legend(title=\"Quantization\" if flow_8bit else \"Collaborator\", bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " \n", - " # 2. 
Peak memory across rounds (bottom plot)\n", - " peak_data = memory_df[memory_df[\"Phase\"] == \"peak\"]\n", - " if not peak_data.empty:\n", - " group_var = \"Quantization\" if flow_8bit else \"Collaborator\"\n", - " sns.lineplot(\n", - " x=\"Round Number\", \n", - " y=\"Memory (MB)\", \n", - " hue=group_var, \n", - " data=peak_data, \n", - " marker='o', \n", - " sort=True,\n", - " linewidth=3,\n", - " markersize=10,\n", - " ax=axes[1]\n", - " )\n", - " axes[1].set_title(\"Peak Memory Usage Across Rounds\", fontsize=14, fontweight='bold')\n", - " axes[1].set_xlabel(\"Round\", fontsize=12)\n", - " axes[1].set_ylabel(\"Memory (MB)\", fontsize=12)\n", - " axes[1].legend(title=group_var, bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " axes[1].grid(True, linestyle='--', alpha=0.7)\n", - " \n", - " plt.tight_layout()\n", - " plt.savefig('memory_metrics_comparison.png', dpi=300, bbox_inches='tight')\n", - " plt.show()\n", - "\n", - "def plot_loss_metrics(flow_4bit, flow_8bit=None):\n", - " \"\"\"Plot training and evaluation loss metrics comparing 4-bit and 8-bit quantization\"\"\"\n", - " # Extract and organize loss data for 4-bit\n", - " loss_data = []\n", - " \n", - " for collab, rounds_data in flow_4bit.all_memory_stats.items():\n", - " for round_name, stats in rounds_data.items():\n", - " # Extract round number from round_name (e.g., 'round_0' -> 0)\n", - " round_num = int(round_name.split('_')[1]) if '_' in round_name else 0\n", - " quant_type = stats.get(\"quant_type\", \"4bit\")\n", - " \n", - " # Extract loss data\n", - " if 'training_loss' in stats and stats['training_loss'] is not None:\n", - " loss_data.append({\n", - " \"Collaborator\": collab,\n", - " \"Round\": round_name,\n", - " \"Round Number\": round_num,\n", - " \"Metric\": \"Training Loss\",\n", - " \"Value\": stats['training_loss'],\n", - " \"Quantization\": quant_type\n", - " })\n", - " if 'eval_loss' in stats and stats['eval_loss'] is not None:\n", - " loss_data.append({\n", - " \"Collaborator\": 
collab,\n", - " \"Round\": round_name,\n", - " \"Round Number\": round_num,\n", - " \"Metric\": \"Evaluation Loss\",\n", - " \"Value\": stats['eval_loss'],\n", - " \"Quantization\": quant_type\n", - " })\n", - " \n", - " # Add 8-bit data if provided\n", - " if flow_8bit is not None:\n", - " for collab, rounds_data in flow_8bit.all_memory_stats.items():\n", - " for round_name, stats in rounds_data.items():\n", - " # Extract round number from round_name (e.g., 'round_0' -> 0)\n", - " round_num = int(round_name.split('_')[1]) if '_' in round_name else 0\n", - " quant_type = stats.get(\"quant_type\", \"8bit\")\n", - " \n", - " # Extract loss data\n", - " if 'training_loss' in stats and stats['training_loss'] is not None:\n", - " loss_data.append({\n", - " \"Collaborator\": collab,\n", - " \"Round\": round_name,\n", - " \"Round Number\": round_num,\n", - " \"Metric\": \"Training Loss\",\n", - " \"Value\": stats['training_loss'],\n", - " \"Quantization\": quant_type\n", - " })\n", - " if 'eval_loss' in stats and stats['eval_loss'] is not None:\n", - " loss_data.append({\n", - " \"Collaborator\": collab,\n", - " \"Round\": round_name,\n", - " \"Round Number\": round_num,\n", - " \"Metric\": \"Evaluation Loss\",\n", - " \"Value\": stats['eval_loss'],\n", - " \"Quantization\": quant_type\n", - " })\n", - " \n", - " if not loss_data:\n", - " print(\"No loss data collected\")\n", - " return\n", - " \n", - " loss_df = pd.DataFrame(loss_data)\n", - " \n", - " # Create a figure with subplots for loss metrics\n", - " if flow_8bit is None:\n", - " fig, axes = plt.subplots(2, 1, figsize=(15, 12), gridspec_kw={'height_ratios': [1, 0.8]})\n", - " \n", - " # 1. 
Training and eval loss per round (top plot)\n", - " sns.lineplot(\n", - " x=\"Round Number\", \n", - " y=\"Value\", \n", - " hue=\"Collaborator\", \n", - " style=\"Metric\", \n", - " data=loss_df, \n", - " marker='o', \n", - " sort=True,\n", - " linewidth=3,\n", - " markersize=10,\n", - " ax=axes[0]\n", - " )\n", - " axes[0].set_title(\"Training and Evaluation Loss by Round\", fontsize=14, fontweight='bold')\n", - " axes[0].set_xlabel(\"Round\", fontsize=12)\n", - " axes[0].set_ylabel(\"Loss\", fontsize=12)\n", - " axes[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " axes[0].grid(True, linestyle='--', alpha=0.7)\n", - " \n", - " # 2. Boxplot of loss distribution by round (bottom plot)\n", - " sns.boxplot(x=\"Round\", y=\"Value\", hue=\"Metric\", data=loss_df, ax=axes[1])\n", - " axes[1].set_title(\"Loss Distribution Across Rounds\", fontsize=14, fontweight='bold')\n", - " axes[1].set_xlabel(\"Round\", fontsize=12)\n", - " axes[1].set_ylabel(\"Loss Value\", fontsize=12)\n", - " axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " else:\n", - " # Comparison between 4-bit and 8-bit\n", - " fig, axes = plt.subplots(2, 1, figsize=(15, 12), gridspec_kw={'height_ratios': [1, 1]})\n", - " \n", - " # 1. Training loss comparison\n", - " training_loss_df = loss_df[loss_df[\"Metric\"] == \"Training Loss\"]\n", - " sns.lineplot(\n", - " x=\"Round Number\", \n", - " y=\"Value\", \n", - " hue=\"Quantization\", \n", - " style=\"Collaborator\", \n", - " data=training_loss_df, \n", - " marker='o', \n", - " sort=True,\n", - " linewidth=3,\n", - " markersize=10,\n", - " ax=axes[0]\n", - " )\n", - " axes[0].set_title(\"Training Loss Comparison: 4-bit vs 8-bit\", fontsize=14, fontweight='bold')\n", - " axes[0].set_xlabel(\"Round\", fontsize=12)\n", - " axes[0].set_ylabel(\"Training Loss\", fontsize=12)\n", - " axes[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " axes[0].grid(True, linestyle='--', alpha=0.7)\n", - " \n", - " # 2. 
Evaluation loss comparison\n", - " eval_loss_df = loss_df[loss_df[\"Metric\"] == \"Evaluation Loss\"]\n", - " sns.lineplot(\n", - " x=\"Round Number\", \n", - " y=\"Value\", \n", - " hue=\"Quantization\", \n", - " style=\"Collaborator\", \n", - " data=eval_loss_df, \n", - " marker='o', \n", - " sort=True,\n", - " linewidth=3,\n", - " markersize=10,\n", - " ax=axes[1]\n", - " )\n", - " axes[1].set_title(\"Evaluation Loss Comparison: 4-bit vs 8-bit\", fontsize=14, fontweight='bold')\n", - " axes[1].set_xlabel(\"Round\", fontsize=12)\n", - " axes[1].set_ylabel(\"Evaluation Loss\", fontsize=12)\n", - " axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " axes[1].grid(True, linestyle='--', alpha=0.7)\n", - " \n", - " plt.tight_layout()\n", - " plt.savefig('loss_metrics_comparison.png', dpi=300, bbox_inches='tight')\n", - " plt.show()\n", - "\n", - "def plot_aggregated_metrics(flow_4bit, flow_8bit):\n", - " \"\"\"Plot aggregated metrics comparing 4-bit and 8-bit quantization\"\"\"\n", - " if (not hasattr(flow_4bit, 'average_loss_history') or not flow_4bit.average_loss_history or\n", - " not hasattr(flow_8bit, 'average_loss_history') or not flow_8bit.average_loss_history):\n", - " print(\"Not enough aggregated metrics data available\")\n", - " return\n", - " \n", - " # Create comparison dataframes\n", - " rounds_4bit = list(range(len(flow_4bit.average_loss_history)))\n", - " data_4bit = pd.DataFrame({\n", - " 'Round': rounds_4bit,\n", - " 'Average Training Loss': flow_4bit.average_loss_history,\n", - " 'Aggregated Model Loss': flow_4bit.agg_model_loss_history,\n", - " 'Local Model Loss': flow_4bit.local_model_loss_history,\n", - " 'Quantization': '4-bit'\n", - " })\n", - " \n", - " rounds_8bit = list(range(len(flow_8bit.average_loss_history)))\n", - " data_8bit = pd.DataFrame({\n", - " 'Round': rounds_8bit,\n", - " 'Average Training Loss': flow_8bit.average_loss_history,\n", - " 'Aggregated Model Loss': flow_8bit.agg_model_loss_history,\n", - " 'Local Model 
Loss': flow_8bit.local_model_loss_history,\n", - " 'Quantization': '8-bit'\n", - " })\n", - " \n", - " # Combine data\n", - " combined_data = pd.concat([data_4bit, data_8bit])\n", - " \n", - " # Melt for easier plotting\n", - " melted_data = pd.melt(\n", - " combined_data,\n", - " id_vars=['Round', 'Quantization'],\n", - " value_vars=['Average Training Loss', 'Aggregated Model Loss', 'Local Model Loss'],\n", - " var_name='Metric',\n", - " value_name='Loss'\n", - " )\n", - " \n", - " # Plot comparison\n", - " plt.figure(figsize=(15, 8))\n", - " sns.lineplot(\n", - " data=melted_data,\n", - " x='Round',\n", - " y='Loss',\n", - " hue='Quantization',\n", - " style='Metric',\n", - " markers=True,\n", - " dashes=True,\n", - " linewidth=3\n", - " )\n", - " \n", - " plt.title('Comparison of 4-bit vs 8-bit Quantization Performance', fontsize=16, fontweight='bold')\n", - " plt.xlabel('Round', fontsize=14)\n", - " plt.ylabel('Loss', fontsize=14)\n", - " plt.grid(True, linestyle='--', alpha=0.7)\n", - " plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plt.tight_layout()\n", - " plt.savefig('aggregated_comparison.png', dpi=300, bbox_inches='tight')\n", - " plt.show()" + "# Visualize training and validation loss across quantization methods\n", + "plot_loss_metrics(flflow_4bit, flflow_8bit)" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "61c7da64", + "execution_count": 18, + "id": "c3be3b11", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "==== Training & Evaluation Loss Comparison: 4-bit vs 8-bit ====\n", - "\n" - ] - }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, "output_type": "display_data" - } - ], - "source": [ - "# Plot training loss metrics comparison between 4-bit and 8-bit\n", - "print(\"\\n==== Training & Evaluation Loss Comparison: 4-bit vs 8-bit ====\\n\")\n", - "plot_loss_metrics(flflow_4bit, flflow_8bit)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "397cb9c7", - "metadata": {}, - "outputs": [ + }, { "name": "stdout", "output_type": "stream", "text": [ "\n", - "==== Aggregated Performance Metrics: 4-bit vs 8-bit ====\n", - "\n" + "==== Percentage Difference (8-bit vs 4-bit) ====\n", + "\n", + "Avg Training Loss: 8-bit is inf% higher than 4-bit\n", + "Agg Model Loss: 8-bit is 0.52% higher than 4-bit\n", + "Local Model Loss: 8-bit is 1.13% lower than 4-bit\n" ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ - "# Plot aggregated metrics comparison\n", - "print(\"\\n==== Aggregated Performance Metrics: 4-bit vs 8-bit ====\\n\")\n", + "# Visualize aggregated metrics (memory vs. performance tradeoff)\n", "plot_aggregated_metrics(flflow_4bit, flflow_8bit)" ] }, { "cell_type": "markdown", - "id": "4c089e09", + "id": "9a4de940", "metadata": {}, "source": [ - "### Memory Efficiency Comparison\n", + "## Conclusion\n", + "\n", + "This notebook has demonstrated how to implement federated fine-tuning of Microsoft's Phi-4 model using OpenFL with both 4-bit and 8-bit quantization approaches. The visualization and analysis above help us understand the tradeoffs between these quantization methods:\n", + "\n", + "### Memory Usage and Performance Comparison\n", + "\n", + "- **Memory Footprint**: 4-bit quantization used approximately 2.6% less memory (55,770 MB vs 57,204 MB) compared to 8-bit quantization.\n", + "\n", + "- **Model Quality**: 4-bit quantization achieved better loss metrics overall:\n", + " - Training Loss: 0.1754 ± 0.0877 for 4-bit vs. significantly higher for 8-bit\n", + " - Evaluation Loss: 0.4618 ± 0.0725 for 4-bit vs. 
0.4909 ± 0.0776 for 8-bit (6.3% higher)\n", + "\n", + "- **Performance Difference by Metric**:\n", + " - Average Training Loss: 8-bit significantly underperformed compared to 4-bit\n", + " - Aggregated Model Loss: 8-bit was 0.52% higher (worse) than 4-bit\n", + " - Local Model Loss: 8-bit was 1.13% lower (better) than 4-bit\n", "\n", - "- **4-bit Quantization**: Uses less memory overall, allowing for larger batch sizes or model sizes on the same hardware.\n", - "- **8-bit Quantization**: Requires more memory but still offers significant savings compared to full precision (FP16/FP32).\n", - "- **Peak Memory Usage**: The difference in peak memory consumption shows the trade-off between precision and memory requirements.\n", + "### Key Insights\n", "\n", - "### Training Performance Comparison\n", + "1. **Memory-Performance Tradeoff**: While 8-bit quantization required slightly more memory, the relative performance differences in evaluation metrics were more significant, suggesting 4-bit quantization offers a better memory-performance balance for this model and task.\n", "\n", - "- **Training Loss**: 8-bit quantization typically maintains closer fidelity to the original model, potentially leading to slightly better training convergence.\n", - "- **Evaluation Loss**: The evaluation metrics help determine if the higher precision of 8-bit quantization translates to better model performance.\n", + "2. **Training Stability**: The 4-bit quantization approach demonstrated more stable and better training performance compared to 8-bit quantization.\n", "\n", - "### Use Case Recommendations\n", + "3. 
**Efficiency Considerations**: Despite the memory analysis suggesting 8-bit provides more efficient memory usage relative to loss in some metrics, the overall performance profile favors 4-bit quantization for practical federated learning deployments.\n", "\n", - "- **Resource-constrained environments**: 4-bit quantization provides better memory efficiency for edge devices or limited GPU resources.\n", - "- **Higher precision needs**: If model accuracy is critical and resources permit, 8-bit quantization offers a good balance between performance and efficiency.\n", - "- **Federated Learning Impact**: The quantization choice particularly affects resource utilization across collaborators in the federated setting." + "By combining federated learning with appropriate quantization techniques, we can successfully fine-tune large language models while balancing computational resource constraints across federated devices. For this Phi-4 model, the 4-bit quantization approach appears to offer the better balance of memory efficiency and model performance." ] } ], diff --git a/openfl/utilities/phi_utils.py b/openfl/utilities/phi_utils.py new file mode 100644 index 0000000000..1d914b8412 --- /dev/null +++ b/openfl/utilities/phi_utils.py @@ -0,0 +1,434 @@ +""" +Utility functions for Phi-4 model quantization and federated learning experiments. 
+This module contains: +- Memory tracking utilities +- Visualization functions for comparing 4-bit and 8-bit quantization +""" + +import torch +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +import numpy as np +from matplotlib.ticker import EngFormatter + +def get_gpu_memory_info(): + """Get GPU memory usage information in MB.""" + try: + if torch.cuda.is_available(): + allocated = torch.cuda.memory_allocated() / (1024 * 1024) + reserved = torch.cuda.memory_reserved() / (1024 * 1024) + max_allocated = torch.cuda.max_memory_allocated() / (1024 * 1024) + return { + "allocated": allocated, + "reserved": reserved, + "max_allocated": max_allocated + } + else: + return {"allocated": 0, "reserved": 0, "max_allocated": 0} + except: + return {"allocated": 0, "reserved": 0, "max_allocated": 0} + +class MemoryTracker: + """Track GPU memory usage during training""" + def __init__(self, collaborator_name, quant_type): + self.collaborator_name = collaborator_name + self.quant_type = quant_type + self.timestamps = {} + self.peak = {"allocated": 0, "reserved": 0, "max_allocated": 0} + self.training_loss = None + self.eval_loss = None + + def log(self, timestamp): + """Log current memory usage at a specific timestamp""" + self.timestamps[timestamp] = get_gpu_memory_info() + + def log_loss(self, training_loss=None, eval_loss=None): + """Log training or evaluation loss""" + if training_loss is not None: + self.training_loss = training_loss + if eval_loss is not None: + self.eval_loss = eval_loss + + def update_peak(self): + """Update peak memory usage values""" + current = get_gpu_memory_info() + self.peak["allocated"] = max(self.peak["allocated"], current["allocated"]) + self.peak["reserved"] = max(self.peak["reserved"], current["reserved"]) + self.peak["max_allocated"] = max(self.peak["max_allocated"], current["max_allocated"]) + + def reset_peak(self): + """Reset peak memory usage values""" + self.peak = {"allocated": 0, "reserved": 0, 
"max_allocated": 0} + + def report(self): + """Print memory usage report""" + print(f"\n==== Memory Usage Report for {self.collaborator_name} ({self.quant_type}) ====") + print(f"Peak Memory Usage:") + print(f" Allocated: {self.peak['allocated']:.2f} MB") + print(f" Reserved: {self.peak['reserved']:.2f} MB") + print(f" Max Allocated: {self.peak['max_allocated']:.2f} MB") + + print("\nMemory Usage by Stage:") + for timestamp, mem in self.timestamps.items(): + print(f" {timestamp}:") + print(f" Allocated: {mem['allocated']:.2f} MB") + print(f" Reserved: {mem['reserved']:.2f} MB") + print(f" Max Allocated: {mem['max_allocated']:.2f} MB") + + print("\nPerformance Metrics:") + if self.training_loss is not None: + print(f" Training Loss: {self.training_loss:.4f}") + if self.eval_loss is not None: + print(f" Evaluation Loss: {self.eval_loss:.4f}") + print("-" * 50) + + def get_stats(self): + """Get all statistics as a dictionary""" + stats = { + "peak_allocated": self.peak["allocated"], + "peak_reserved": self.peak["reserved"], + "peak_max_allocated": self.peak["max_allocated"], + "quant_type": self.quant_type, + "training_loss": self.training_loss, + "eval_loss": self.eval_loss + } + for timestamp, mem in self.timestamps.items(): + stats[f"{timestamp}_allocated"] = mem["allocated"] + stats[f"{timestamp}_reserved"] = mem["reserved"] + stats[f"{timestamp}_max_allocated"] = mem["max_allocated"] + return stats + +def plot_memory_metrics(flow_4bit, flow_8bit): + """Plot and compare memory metrics between 4-bit and 8-bit quantization.""" + try: + # Create figure with multiple subplots + fig, axs = plt.subplots(2, 2, figsize=(16, 12)) + fig.suptitle('4-bit vs 8-bit Quantization Comparison', fontsize=16) + + # Colors for consistent plotting + colors_4bit = {'Portland': 'blue', 'Seattle': 'green'} + colors_8bit = {'Portland': 'darkblue', 'Seattle': 'darkgreen'} + markers_4bit = {'Portland': 'o', 'Seattle': 's'} + markers_8bit = {'Portland': '^', 'Seattle': 'D'} + + # Flatten the 
metric data for plotting + memory_data = [] + for quant, flow in [("4-bit", flow_4bit), ("8-bit", flow_8bit)]: + stats = flow.all_memory_stats + for collab, rounds_data in stats.items(): + for round_name, metrics in rounds_data.items(): + round_num = int(round_name.split('_')[1]) + row = { + 'Collaborator': collab, + 'Round': round_num, + 'Quantization': quant, + 'Peak Memory (MB)': metrics.get('peak_max_allocated', 0), + 'Training Loss': metrics.get('training_loss', 0), + 'Eval Loss': metrics.get('eval_loss', 0) + } + memory_data.append(row) + + df = pd.DataFrame(memory_data) + + # Plot 1: Peak Memory Usage by Round + axs[0, 0].set_title('Peak Memory Usage by Round') + for quant_type in ['4-bit', '8-bit']: + for collab in df['Collaborator'].unique(): + subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)] + color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab] + marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab] + axs[0, 0].plot(subset['Round'], subset['Peak Memory (MB)'], + marker=marker, linestyle='-', label=f"{collab} ({quant_type})", + color=color) + + axs[0, 0].set_xlabel('Round') + axs[0, 0].set_ylabel('Memory (MB)') + axs[0, 0].legend() + axs[0, 0].grid(True, alpha=0.3) + axs[0, 0].yaxis.set_major_formatter(EngFormatter(unit='B')) + + # Plot 2: Training Loss by Round + axs[0, 1].set_title('Training Loss by Round') + for quant_type in ['4-bit', '8-bit']: + for collab in df['Collaborator'].unique(): + subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)] + color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab] + marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab] + axs[0, 1].plot(subset['Round'], subset['Training Loss'], + marker=marker, linestyle='-', label=f"{collab} ({quant_type})", + color=color) + + axs[0, 1].set_xlabel('Round') + axs[0, 1].set_ylabel('Loss') + axs[0, 1].legend() + axs[0, 
1].grid(True, alpha=0.3) + + # Plot 3: Eval Loss by Round + axs[1, 0].set_title('Evaluation Loss by Round') + for quant_type in ['4-bit', '8-bit']: + for collab in df['Collaborator'].unique(): + subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)] + color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab] + marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab] + axs[1, 0].plot(subset['Round'], subset['Eval Loss'], + marker=marker, linestyle='-', label=f"{collab} ({quant_type})", + color=color) + + axs[1, 0].set_xlabel('Round') + axs[1, 0].set_ylabel('Loss') + axs[1, 0].legend() + axs[1, 0].grid(True, alpha=0.3) + + # Plot 4: Memory vs Loss (bubble chart) + axs[1, 1].set_title('Memory Usage vs. Evaluation Loss') + for quant_type in ['4-bit', '8-bit']: + for collab in df['Collaborator'].unique(): + subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)] + color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab] + marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab] + + # Size proportional to round number for visual differentiation + sizes = [100 * (r+1) for r in subset['Round']] + + axs[1, 1].scatter(subset['Peak Memory (MB)'], subset['Eval Loss'], + s=sizes, alpha=0.7, + label=f"{collab} ({quant_type})", + color=color, marker=marker) + + # Add round number annotations + for _, row in subset.iterrows(): + axs[1, 1].annotate(f"R{int(row['Round'])}", + (row['Peak Memory (MB)'], row['Eval Loss']), + xytext=(5, 5), textcoords='offset points') + + axs[1, 1].set_xlabel('Peak Memory (MB)') + axs[1, 1].set_ylabel('Evaluation Loss') + axs[1, 1].legend() + axs[1, 1].grid(True, alpha=0.3) + axs[1, 1].xaxis.set_major_formatter(EngFormatter(unit='B')) + + plt.tight_layout() + plt.subplots_adjust(top=0.92) + plt.show() + + # Print summary comparison + print("\n==== Performance Summary ====\n") + # Group by quantization and compute 
means + summary = df.groupby('Quantization').agg({ + 'Peak Memory (MB)': 'mean', + 'Training Loss': 'mean', + 'Eval Loss': 'mean' + }).reset_index() + + # Calculate percentage difference + mem_diff_pct = ((summary.loc[1, 'Peak Memory (MB)'] - summary.loc[0, 'Peak Memory (MB)']) / + summary.loc[0, 'Peak Memory (MB)'] * 100) + + eval_diff_pct = ((summary.loc[1, 'Eval Loss'] - summary.loc[0, 'Eval Loss']) / + summary.loc[0, 'Eval Loss'] * 100) + + print(f"Memory Usage Comparison:") + print(f" 4-bit Avg: {summary.loc[0, 'Peak Memory (MB)']:.2f} MB") + print(f" 8-bit Avg: {summary.loc[1, 'Peak Memory (MB)']:.2f} MB") + print(f" Difference: {abs(mem_diff_pct):.1f}% {'more' if mem_diff_pct > 0 else 'less'} memory with 8-bit") + + print(f"\nEvaluation Loss Comparison:") + print(f" 4-bit Avg: {summary.loc[0, 'Eval Loss']:.4f}") + print(f" 8-bit Avg: {summary.loc[1, 'Eval Loss']:.4f}") + print(f" Difference: {abs(eval_diff_pct):.1f}% {'higher' if eval_diff_pct > 0 else 'lower'} loss with 8-bit") + + loss_efficiency = ((summary.loc[0, 'Eval Loss'] - summary.loc[1, 'Eval Loss']) / + (summary.loc[0, 'Peak Memory (MB)'] - summary.loc[1, 'Peak Memory (MB)'])) + + if loss_efficiency > 0: + efficiency_msg = "8-bit provides more efficiency memory usage relative to loss" + else: + efficiency_msg = "4-bit provides more efficiency memory usage relative to loss" + + print(f"\nEfficiency Analysis: {efficiency_msg}") + except ImportError: + print("Plotting requires matplotlib and pandas. 
Install with: pip install matplotlib pandas") + except Exception as e: + print(f"Error plotting metrics: {str(e)}") + +def plot_loss_metrics(flow_4bit, flow_8bit): + """Plot training and evaluation loss metrics comparing 4-bit and 8-bit quantization""" + # Extract and organize loss data + loss_data = [] + + # Helper function to safely convert tensor to float value + def tensor_to_float(val): + if val is None: + return None + if isinstance(val, torch.Tensor): + return val.detach().cpu().float().numpy().item() + return val + + # Process 4-bit data + for collab, rounds_data in flow_4bit.all_memory_stats.items(): + for round_name, stats in rounds_data.items(): + round_num = int(round_name.split('_')[1]) if '_' in round_name else 0 + quant_type = stats.get("quant_type", "4bit") + training_loss = tensor_to_float(stats.get("training_loss")) + eval_loss = tensor_to_float(stats.get("eval_loss")) + + if training_loss is not None or eval_loss is not None: + loss_data.append({ + "Collaborator": collab, + "Round": round_name, + "Round Number": round_num, + "Training Loss": training_loss, + "Eval Loss": eval_loss, + "Quantization": quant_type + }) + + # Process 8-bit data if provided + if flow_8bit is not None: + for collab, rounds_data in flow_8bit.all_memory_stats.items(): + for round_name, stats in rounds_data.items(): + round_num = int(round_name.split('_')[1]) if '_' in round_name else 0 + quant_type = stats.get("quant_type", "8bit") + training_loss = tensor_to_float(stats.get("training_loss")) + eval_loss = tensor_to_float(stats.get("eval_loss")) + + if training_loss is not None or eval_loss is not None: + loss_data.append({ + "Collaborator": collab, + "Round": round_name, + "Round Number": round_num, + "Training Loss": training_loss, + "Eval Loss": eval_loss, + "Quantization": quant_type + }) + + loss_df = pd.DataFrame(loss_data) + + # Create a figure with subplots for loss metrics + fig, axes = plt.subplots(2, 1, figsize=(15, 12), gridspec_kw={'height_ratios': [1, 1]}) + 
+ # 1. Training loss across rounds (top plot) + group_var = "Quantization" if flow_8bit else "Collaborator" + + sns.lineplot( + x="Round Number", + y="Training Loss", + hue=group_var, + data=loss_df, + marker='o', + sort=True, + linewidth=3, + markersize=10, + ax=axes[0] + ) + axes[0].set_title("Training Loss Across Rounds", fontsize=14, fontweight='bold') + axes[0].set_xlabel("Round", fontsize=12) + axes[0].set_ylabel("Loss", fontsize=12) + axes[0].legend(title=group_var, bbox_to_anchor=(1.05, 1), loc='upper left') + + # 2. Evaluation loss across rounds (bottom plot) + sns.lineplot( + x="Round Number", + y="Eval Loss", + hue=group_var, + data=loss_df, + marker='o', + sort=True, + linewidth=3, + markersize=10, + ax=axes[1] + ) + axes[1].set_title("Evaluation Loss Across Rounds", fontsize=14, fontweight='bold') + axes[1].set_xlabel("Round", fontsize=12) + axes[1].set_ylabel("Loss", fontsize=12) + axes[1].legend(title=group_var, bbox_to_anchor=(1.05, 1), loc='upper left') + + plt.tight_layout() + plt.show() + + # Print summary statistics + if flow_8bit: + print("\n==== Loss Comparison: 4-bit vs 8-bit ====\n") + + # Group by quantization and compute means + summary = loss_df.groupby('Quantization').agg({ + 'Training Loss': ['mean', 'std'], + 'Eval Loss': ['mean', 'std'] + }) + + print(f"Training Loss (4-bit): {summary.loc['4bit', ('Training Loss', 'mean')]:.4f} ± {summary.loc['4bit', ('Training Loss', 'std')]:.4f}") + print(f"Training Loss (8-bit): {summary.loc['8bit', ('Training Loss', 'mean')]:.4f} ± {summary.loc['8bit', ('Training Loss', 'std')]:.4f}") + print(f"\nEval Loss (4-bit): {summary.loc['4bit', ('Eval Loss', 'mean')]:.4f} ± {summary.loc['4bit', ('Eval Loss', 'std')]:.4f}") + print(f"Eval Loss (8-bit): {summary.loc['8bit', ('Eval Loss', 'mean')]:.4f} ± {summary.loc['8bit', ('Eval Loss', 'std')]:.4f}") + +def plot_aggregated_metrics(flow_4bit, flow_8bit): + """Plot aggregated metrics comparing 4-bit and 8-bit quantization""" + # Create a figure with subplots 
for aggregated metrics + fig, axes = plt.subplots(1, 2, figsize=(16, 6)) + + # Helper function to safely convert tensor to float value + def tensor_to_float(val): + if val is None: + return None + if isinstance(val, torch.Tensor): + return val.detach().cpu().float().numpy().item() + return val + + # Convert any tensor values to CPU before plotting + loss_history_4bit = [tensor_to_float(x) for x in flow_4bit.average_loss_history] + loss_history_8bit = [tensor_to_float(x) for x in flow_8bit.average_loss_history] + agg_model_loss_4bit = [tensor_to_float(x) for x in flow_4bit.agg_model_loss_history] + agg_model_loss_8bit = [tensor_to_float(x) for x in flow_8bit.agg_model_loss_history] + local_model_loss_4bit = [tensor_to_float(x) for x in flow_4bit.local_model_loss_history] + local_model_loss_8bit = [tensor_to_float(x) for x in flow_8bit.local_model_loss_history] + + # Setup data + rounds = list(range(len(loss_history_4bit))) + + # Plot average loss history + axes[0].plot(rounds, loss_history_4bit, 'bo-', linewidth=2, markersize=8, label='4-bit') + axes[0].plot(rounds, loss_history_8bit, 'ro-', linewidth=2, markersize=8, label='8-bit') + axes[0].set_title('Average Training Loss by Round', fontsize=14, fontweight='bold') + axes[0].set_xlabel('Round', fontsize=12) + axes[0].set_ylabel('Loss', fontsize=12) + axes[0].grid(True, alpha=0.3) + axes[0].legend(fontsize=10) + + # Plot final metrics comparison + metrics = ['Avg Training Loss', 'Agg Model Loss', 'Local Model Loss'] + values_4bit = [loss_history_4bit[-1], agg_model_loss_4bit[-1], local_model_loss_4bit[-1]] + values_8bit = [loss_history_8bit[-1], agg_model_loss_8bit[-1], local_model_loss_8bit[-1]] + + x = np.arange(len(metrics)) + width = 0.35 + + bars1 = axes[1].bar(x - width/2, values_4bit, width, label='4-bit', color='blue', alpha=0.7) + bars2 = axes[1].bar(x + width/2, values_8bit, width, label='8-bit', color='red', alpha=0.7) + + # Add value labels on bars + for bars in [bars1, bars2]: + for bar in bars: + 
height = bar.get_height() + axes[1].annotate(f'{height:.4f}', + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha='center', va='bottom', + fontsize=9) + + axes[1].set_title('Final Metrics Comparison', fontsize=14, fontweight='bold') + axes[1].set_ylabel('Loss', fontsize=12) + axes[1].set_xticks(x) + axes[1].set_xticklabels(metrics, rotation=15) + axes[1].legend(loc='upper right', fontsize=10) + axes[1].grid(True, alpha=0.3) + + plt.tight_layout() + plt.show() + + # Print percent differences + print("\n==== Percentage Difference (8-bit vs 4-bit) ====\n") + for i, metric in enumerate(metrics): + pct_diff = ((values_8bit[i] - values_4bit[i]) / values_4bit[i]) * 100 + direction = "higher" if pct_diff > 0 else "lower" + print(f"{metric}: 8-bit is {abs(pct_diff):.2f}% {direction} than 4-bit") From b68401305ac0f1309efb4e44c897f44908b2c357 Mon Sep 17 00:00:00 2001 From: Rajith Date: Wed, 21 May 2025 16:14:13 +0530 Subject: [PATCH 32/34] removed nvidia-smi cell --- .../workflow/LLM/phi-4-withquantization.ipynb | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb index 5e592b5873..a8c3dba3ca 100644 --- a/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb +++ b/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb @@ -67,43 +67,6 @@ "!pip install torch transformers peft datasets trl==0.12.2 bitsandbytes accelerate -q" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a7ae1a7e-8c16-4c5a-be57-33d84723aed7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wed May 21 08:21:44 2025 \n", - "+-----------------------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |\n", - 
"|-----------------------------------------+------------------------+----------------------+\n", - "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. |\n", - "|=========================================+========================+======================|\n", - "| 0 NVIDIA H100 NVL Off | 00000001:00:00.0 Off | 0 |\n", - "| N/A 31C P0 61W / 400W | 1MiB / 95830MiB | 0% Default |\n", - "| | | Disabled |\n", - "+-----------------------------------------+------------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=========================================================================================|\n", - "| No running processes found |\n", - "+-----------------------------------------------------------------------------------------+\n" - ] - } - ], - "source": [ - "!nvidia-smi" - ] - }, { "cell_type": "markdown", "id": "440a9c39-ec42-45a5-80f6-9a9e0bc90d2f", From 17f1bdf6a8d490d76984b4f0ea4c772482b501e2 Mon Sep 17 00:00:00 2001 From: Rajith Date: Wed, 21 May 2025 16:59:42 +0530 Subject: [PATCH 33/34] fix code format --- openfl/utilities/phi_utils.py | 466 +++++++++++++++++++--------------- 1 file changed, 260 insertions(+), 206 deletions(-) diff --git a/openfl/utilities/phi_utils.py b/openfl/utilities/phi_utils.py index 1d914b8412..e1c2415ed1 100644 --- a/openfl/utilities/phi_utils.py +++ b/openfl/utilities/phi_utils.py @@ -5,13 +5,16 @@ - Visualization functions for comparing 4-bit and 8-bit quantization """ -import torch +# flake8: noqa: E501, E722 + import matplotlib.pyplot as plt +import numpy as np import pandas as pd import seaborn as sns -import numpy as np +import torch from matplotlib.ticker import EngFormatter + def get_gpu_memory_info(): """Get GPU 
memory usage information in MB.""" try: @@ -19,18 +22,16 @@ def get_gpu_memory_info(): allocated = torch.cuda.memory_allocated() / (1024 * 1024) reserved = torch.cuda.memory_reserved() / (1024 * 1024) max_allocated = torch.cuda.max_memory_allocated() / (1024 * 1024) - return { - "allocated": allocated, - "reserved": reserved, - "max_allocated": max_allocated - } + return {"allocated": allocated, "reserved": reserved, "max_allocated": max_allocated} else: return {"allocated": 0, "reserved": 0, "max_allocated": 0} except: return {"allocated": 0, "reserved": 0, "max_allocated": 0} + class MemoryTracker: """Track GPU memory usage during training""" + def __init__(self, collaborator_name, quant_type): self.collaborator_name = collaborator_name self.quant_type = quant_type @@ -38,51 +39,51 @@ def __init__(self, collaborator_name, quant_type): self.peak = {"allocated": 0, "reserved": 0, "max_allocated": 0} self.training_loss = None self.eval_loss = None - + def log(self, timestamp): """Log current memory usage at a specific timestamp""" self.timestamps[timestamp] = get_gpu_memory_info() - + def log_loss(self, training_loss=None, eval_loss=None): """Log training or evaluation loss""" if training_loss is not None: self.training_loss = training_loss if eval_loss is not None: self.eval_loss = eval_loss - + def update_peak(self): """Update peak memory usage values""" current = get_gpu_memory_info() self.peak["allocated"] = max(self.peak["allocated"], current["allocated"]) self.peak["reserved"] = max(self.peak["reserved"], current["reserved"]) self.peak["max_allocated"] = max(self.peak["max_allocated"], current["max_allocated"]) - + def reset_peak(self): """Reset peak memory usage values""" self.peak = {"allocated": 0, "reserved": 0, "max_allocated": 0} - + def report(self): """Print memory usage report""" print(f"\n==== Memory Usage Report for {self.collaborator_name} ({self.quant_type}) ====") - print(f"Peak Memory Usage:") + print("Peak Memory Usage:") print(f" Allocated: 
{self.peak['allocated']:.2f} MB") print(f" Reserved: {self.peak['reserved']:.2f} MB") print(f" Max Allocated: {self.peak['max_allocated']:.2f} MB") - + print("\nMemory Usage by Stage:") for timestamp, mem in self.timestamps.items(): print(f" {timestamp}:") print(f" Allocated: {mem['allocated']:.2f} MB") print(f" Reserved: {mem['reserved']:.2f} MB") print(f" Max Allocated: {mem['max_allocated']:.2f} MB") - + print("\nPerformance Metrics:") if self.training_loss is not None: print(f" Training Loss: {self.training_loss:.4f}") if self.eval_loss is not None: print(f" Evaluation Loss: {self.eval_loss:.4f}") print("-" * 50) - + def get_stats(self): """Get all statistics as a dictionary""" stats = { @@ -91,7 +92,7 @@ def get_stats(self): "peak_max_allocated": self.peak["max_allocated"], "quant_type": self.quant_type, "training_loss": self.training_loss, - "eval_loss": self.eval_loss + "eval_loss": self.eval_loss, } for timestamp, mem in self.timestamps.items(): stats[f"{timestamp}_allocated"] = mem["allocated"] @@ -99,164 +100,202 @@ def get_stats(self): stats[f"{timestamp}_max_allocated"] = mem["max_allocated"] return stats -def plot_memory_metrics(flow_4bit, flow_8bit): + +def plot_memory_metrics(flow_4bit, flow_8bit): # NOQA: C901 """Plot and compare memory metrics between 4-bit and 8-bit quantization.""" try: # Create figure with multiple subplots fig, axs = plt.subplots(2, 2, figsize=(16, 12)) - fig.suptitle('4-bit vs 8-bit Quantization Comparison', fontsize=16) - + fig.suptitle("4-bit vs 8-bit Quantization Comparison", fontsize=16) + # Colors for consistent plotting - colors_4bit = {'Portland': 'blue', 'Seattle': 'green'} - colors_8bit = {'Portland': 'darkblue', 'Seattle': 'darkgreen'} - markers_4bit = {'Portland': 'o', 'Seattle': 's'} - markers_8bit = {'Portland': '^', 'Seattle': 'D'} - + colors_4bit = {"Portland": "blue", "Seattle": "green"} + colors_8bit = {"Portland": "darkblue", "Seattle": "darkgreen"} + markers_4bit = {"Portland": "o", "Seattle": "s"} + 
markers_8bit = {"Portland": "^", "Seattle": "D"} + # Flatten the metric data for plotting memory_data = [] for quant, flow in [("4-bit", flow_4bit), ("8-bit", flow_8bit)]: stats = flow.all_memory_stats for collab, rounds_data in stats.items(): for round_name, metrics in rounds_data.items(): - round_num = int(round_name.split('_')[1]) + round_num = int(round_name.split("_")[1]) row = { - 'Collaborator': collab, - 'Round': round_num, - 'Quantization': quant, - 'Peak Memory (MB)': metrics.get('peak_max_allocated', 0), - 'Training Loss': metrics.get('training_loss', 0), - 'Eval Loss': metrics.get('eval_loss', 0) + "Collaborator": collab, + "Round": round_num, + "Quantization": quant, + "Peak Memory (MB)": metrics.get("peak_max_allocated", 0), + "Training Loss": metrics.get("training_loss", 0), + "Eval Loss": metrics.get("eval_loss", 0), } memory_data.append(row) - + df = pd.DataFrame(memory_data) - + # Plot 1: Peak Memory Usage by Round - axs[0, 0].set_title('Peak Memory Usage by Round') - for quant_type in ['4-bit', '8-bit']: - for collab in df['Collaborator'].unique(): - subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)] - color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab] - marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab] - axs[0, 0].plot(subset['Round'], subset['Peak Memory (MB)'], - marker=marker, linestyle='-', label=f"{collab} ({quant_type})", - color=color) - - axs[0, 0].set_xlabel('Round') - axs[0, 0].set_ylabel('Memory (MB)') + axs[0, 0].set_title("Peak Memory Usage by Round") + for quant_type in ["4-bit", "8-bit"]: + for collab in df["Collaborator"].unique(): + subset = df[(df["Quantization"] == quant_type) & (df["Collaborator"] == collab)] + color = colors_4bit[collab] if quant_type == "4-bit" else colors_8bit[collab] + marker = markers_4bit[collab] if quant_type == "4-bit" else markers_8bit[collab] + axs[0, 0].plot( + subset["Round"], + subset["Peak Memory (MB)"], + 
marker=marker, + linestyle="-", + label=f"{collab} ({quant_type})", + color=color, + ) + + axs[0, 0].set_xlabel("Round") + axs[0, 0].set_ylabel("Memory (MB)") axs[0, 0].legend() axs[0, 0].grid(True, alpha=0.3) - axs[0, 0].yaxis.set_major_formatter(EngFormatter(unit='B')) - + axs[0, 0].yaxis.set_major_formatter(EngFormatter(unit="B")) + # Plot 2: Training Loss by Round - axs[0, 1].set_title('Training Loss by Round') - for quant_type in ['4-bit', '8-bit']: - for collab in df['Collaborator'].unique(): - subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)] - color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab] - marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab] - axs[0, 1].plot(subset['Round'], subset['Training Loss'], - marker=marker, linestyle='-', label=f"{collab} ({quant_type})", - color=color) - - axs[0, 1].set_xlabel('Round') - axs[0, 1].set_ylabel('Loss') + axs[0, 1].set_title("Training Loss by Round") + for quant_type in ["4-bit", "8-bit"]: + for collab in df["Collaborator"].unique(): + subset = df[(df["Quantization"] == quant_type) & (df["Collaborator"] == collab)] + color = colors_4bit[collab] if quant_type == "4-bit" else colors_8bit[collab] + marker = markers_4bit[collab] if quant_type == "4-bit" else markers_8bit[collab] + axs[0, 1].plot( + subset["Round"], + subset["Training Loss"], + marker=marker, + linestyle="-", + label=f"{collab} ({quant_type})", + color=color, + ) + + axs[0, 1].set_xlabel("Round") + axs[0, 1].set_ylabel("Loss") axs[0, 1].legend() axs[0, 1].grid(True, alpha=0.3) - + # Plot 3: Eval Loss by Round - axs[1, 0].set_title('Evaluation Loss by Round') - for quant_type in ['4-bit', '8-bit']: - for collab in df['Collaborator'].unique(): - subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)] - color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab] - marker = markers_4bit[collab] if quant_type == '4-bit' else 
markers_8bit[collab] - axs[1, 0].plot(subset['Round'], subset['Eval Loss'], - marker=marker, linestyle='-', label=f"{collab} ({quant_type})", - color=color) - - axs[1, 0].set_xlabel('Round') - axs[1, 0].set_ylabel('Loss') + axs[1, 0].set_title("Evaluation Loss by Round") + for quant_type in ["4-bit", "8-bit"]: + for collab in df["Collaborator"].unique(): + subset = df[(df["Quantization"] == quant_type) & (df["Collaborator"] == collab)] + color = colors_4bit[collab] if quant_type == "4-bit" else colors_8bit[collab] + marker = markers_4bit[collab] if quant_type == "4-bit" else markers_8bit[collab] + axs[1, 0].plot( + subset["Round"], + subset["Eval Loss"], + marker=marker, + linestyle="-", + label=f"{collab} ({quant_type})", + color=color, + ) + + axs[1, 0].set_xlabel("Round") + axs[1, 0].set_ylabel("Loss") axs[1, 0].legend() axs[1, 0].grid(True, alpha=0.3) - + # Plot 4: Memory vs Loss (bubble chart) - axs[1, 1].set_title('Memory Usage vs. Evaluation Loss') - for quant_type in ['4-bit', '8-bit']: - for collab in df['Collaborator'].unique(): - subset = df[(df['Quantization'] == quant_type) & (df['Collaborator'] == collab)] - color = colors_4bit[collab] if quant_type == '4-bit' else colors_8bit[collab] - marker = markers_4bit[collab] if quant_type == '4-bit' else markers_8bit[collab] - + axs[1, 1].set_title("Memory Usage vs. 
Evaluation Loss") + for quant_type in ["4-bit", "8-bit"]: + for collab in df["Collaborator"].unique(): + subset = df[(df["Quantization"] == quant_type) & (df["Collaborator"] == collab)] + color = colors_4bit[collab] if quant_type == "4-bit" else colors_8bit[collab] + marker = markers_4bit[collab] if quant_type == "4-bit" else markers_8bit[collab] + # Size proportional to round number for visual differentiation - sizes = [100 * (r+1) for r in subset['Round']] - - axs[1, 1].scatter(subset['Peak Memory (MB)'], subset['Eval Loss'], - s=sizes, alpha=0.7, - label=f"{collab} ({quant_type})", - color=color, marker=marker) - + sizes = [100 * (r + 1) for r in subset["Round"]] + + axs[1, 1].scatter( + subset["Peak Memory (MB)"], + subset["Eval Loss"], + s=sizes, + alpha=0.7, + label=f"{collab} ({quant_type})", + color=color, + marker=marker, + ) + # Add round number annotations for _, row in subset.iterrows(): - axs[1, 1].annotate(f"R{int(row['Round'])}", - (row['Peak Memory (MB)'], row['Eval Loss']), - xytext=(5, 5), textcoords='offset points') - - axs[1, 1].set_xlabel('Peak Memory (MB)') - axs[1, 1].set_ylabel('Evaluation Loss') + axs[1, 1].annotate( + f"R{int(row['Round'])}", + (row["Peak Memory (MB)"], row["Eval Loss"]), + xytext=(5, 5), + textcoords="offset points", + ) + + axs[1, 1].set_xlabel("Peak Memory (MB)") + axs[1, 1].set_ylabel("Evaluation Loss") axs[1, 1].legend() axs[1, 1].grid(True, alpha=0.3) - axs[1, 1].xaxis.set_major_formatter(EngFormatter(unit='B')) - + axs[1, 1].xaxis.set_major_formatter(EngFormatter(unit="B")) + plt.tight_layout() plt.subplots_adjust(top=0.92) plt.show() - + # Print summary comparison print("\n==== Performance Summary ====\n") # Group by quantization and compute means - summary = df.groupby('Quantization').agg({ - 'Peak Memory (MB)': 'mean', - 'Training Loss': 'mean', - 'Eval Loss': 'mean' - }).reset_index() - + summary = ( + df.groupby("Quantization") + .agg({"Peak Memory (MB)": "mean", "Training Loss": "mean", "Eval Loss": "mean"}) + 
.reset_index() + ) + # Calculate percentage difference - mem_diff_pct = ((summary.loc[1, 'Peak Memory (MB)'] - summary.loc[0, 'Peak Memory (MB)']) / - summary.loc[0, 'Peak Memory (MB)'] * 100) - - eval_diff_pct = ((summary.loc[1, 'Eval Loss'] - summary.loc[0, 'Eval Loss']) / - summary.loc[0, 'Eval Loss'] * 100) - - print(f"Memory Usage Comparison:") + mem_diff_pct = ( + (summary.loc[1, "Peak Memory (MB)"] - summary.loc[0, "Peak Memory (MB)"]) + / summary.loc[0, "Peak Memory (MB)"] + * 100 + ) + + eval_diff_pct = ( + (summary.loc[1, "Eval Loss"] - summary.loc[0, "Eval Loss"]) + / summary.loc[0, "Eval Loss"] + * 100 + ) + + print("Memory Usage Comparison:") print(f" 4-bit Avg: {summary.loc[0, 'Peak Memory (MB)']:.2f} MB") print(f" 8-bit Avg: {summary.loc[1, 'Peak Memory (MB)']:.2f} MB") - print(f" Difference: {abs(mem_diff_pct):.1f}% {'more' if mem_diff_pct > 0 else 'less'} memory with 8-bit") - - print(f"\nEvaluation Loss Comparison:") + print( + f" Difference: {abs(mem_diff_pct):.1f}% {'more' if mem_diff_pct > 0 else 'less'} memory with 8-bit" + ) + + print("\nEvaluation Loss Comparison:") print(f" 4-bit Avg: {summary.loc[0, 'Eval Loss']:.4f}") print(f" 8-bit Avg: {summary.loc[1, 'Eval Loss']:.4f}") - print(f" Difference: {abs(eval_diff_pct):.1f}% {'higher' if eval_diff_pct > 0 else 'lower'} loss with 8-bit") - - loss_efficiency = ((summary.loc[0, 'Eval Loss'] - summary.loc[1, 'Eval Loss']) / - (summary.loc[0, 'Peak Memory (MB)'] - summary.loc[1, 'Peak Memory (MB)'])) - + print( + f" Difference: {abs(eval_diff_pct):.1f}% {'higher' if eval_diff_pct > 0 else 'lower'} loss with 8-bit" + ) + + loss_efficiency = (summary.loc[0, "Eval Loss"] - summary.loc[1, "Eval Loss"]) / ( + summary.loc[0, "Peak Memory (MB)"] - summary.loc[1, "Peak Memory (MB)"] + ) + if loss_efficiency > 0: efficiency_msg = "8-bit provides more efficiency memory usage relative to loss" else: efficiency_msg = "4-bit provides more efficiency memory usage relative to loss" - + print(f"\nEfficiency 
Analysis: {efficiency_msg}") except ImportError: - print("Plotting requires matplotlib and pandas. Install with: pip install matplotlib pandas") + print( + "Plotting requires matplotlib and pandas. Install with: pip install matplotlib pandas" + ) except Exception as e: print(f"Error plotting metrics: {str(e)}") -def plot_loss_metrics(flow_4bit, flow_8bit): + +def plot_loss_metrics(flow_4bit, flow_8bit): # NOQA: C901 """Plot training and evaluation loss metrics comparing 4-bit and 8-bit quantization""" # Extract and organize loss data loss_data = [] - + # Helper function to safely convert tensor to float value def tensor_to_float(val): if val is None: @@ -264,108 +303,120 @@ def tensor_to_float(val): if isinstance(val, torch.Tensor): return val.detach().cpu().float().numpy().item() return val - + # Process 4-bit data for collab, rounds_data in flow_4bit.all_memory_stats.items(): for round_name, stats in rounds_data.items(): - round_num = int(round_name.split('_')[1]) if '_' in round_name else 0 + round_num = int(round_name.split("_")[1]) if "_" in round_name else 0 quant_type = stats.get("quant_type", "4bit") training_loss = tensor_to_float(stats.get("training_loss")) eval_loss = tensor_to_float(stats.get("eval_loss")) - + if training_loss is not None or eval_loss is not None: - loss_data.append({ - "Collaborator": collab, - "Round": round_name, - "Round Number": round_num, - "Training Loss": training_loss, - "Eval Loss": eval_loss, - "Quantization": quant_type - }) - + loss_data.append( + { + "Collaborator": collab, + "Round": round_name, + "Round Number": round_num, + "Training Loss": training_loss, + "Eval Loss": eval_loss, + "Quantization": quant_type, + } + ) + # Process 8-bit data if provided if flow_8bit is not None: for collab, rounds_data in flow_8bit.all_memory_stats.items(): for round_name, stats in rounds_data.items(): - round_num = int(round_name.split('_')[1]) if '_' in round_name else 0 + round_num = int(round_name.split("_")[1]) if "_" in round_name 
else 0 quant_type = stats.get("quant_type", "8bit") training_loss = tensor_to_float(stats.get("training_loss")) eval_loss = tensor_to_float(stats.get("eval_loss")) - + if training_loss is not None or eval_loss is not None: - loss_data.append({ - "Collaborator": collab, - "Round": round_name, - "Round Number": round_num, - "Training Loss": training_loss, - "Eval Loss": eval_loss, - "Quantization": quant_type - }) - + loss_data.append( + { + "Collaborator": collab, + "Round": round_name, + "Round Number": round_num, + "Training Loss": training_loss, + "Eval Loss": eval_loss, + "Quantization": quant_type, + } + ) + loss_df = pd.DataFrame(loss_data) - + # Create a figure with subplots for loss metrics - fig, axes = plt.subplots(2, 1, figsize=(15, 12), gridspec_kw={'height_ratios': [1, 1]}) - + fig, axes = plt.subplots(2, 1, figsize=(15, 12), gridspec_kw={"height_ratios": [1, 1]}) + # 1. Training loss across rounds (top plot) group_var = "Quantization" if flow_8bit else "Collaborator" - + sns.lineplot( - x="Round Number", - y="Training Loss", - hue=group_var, - data=loss_df, - marker='o', + x="Round Number", + y="Training Loss", + hue=group_var, + data=loss_df, + marker="o", sort=True, linewidth=3, markersize=10, - ax=axes[0] + ax=axes[0], ) - axes[0].set_title("Training Loss Across Rounds", fontsize=14, fontweight='bold') + axes[0].set_title("Training Loss Across Rounds", fontsize=14, fontweight="bold") axes[0].set_xlabel("Round", fontsize=12) axes[0].set_ylabel("Loss", fontsize=12) - axes[0].legend(title=group_var, bbox_to_anchor=(1.05, 1), loc='upper left') - + axes[0].legend(title=group_var, bbox_to_anchor=(1.05, 1), loc="upper left") + # 2. 
Evaluation loss across rounds (bottom plot) sns.lineplot( - x="Round Number", - y="Eval Loss", - hue=group_var, - data=loss_df, - marker='o', + x="Round Number", + y="Eval Loss", + hue=group_var, + data=loss_df, + marker="o", sort=True, linewidth=3, markersize=10, - ax=axes[1] + ax=axes[1], ) - axes[1].set_title("Evaluation Loss Across Rounds", fontsize=14, fontweight='bold') + axes[1].set_title("Evaluation Loss Across Rounds", fontsize=14, fontweight="bold") axes[1].set_xlabel("Round", fontsize=12) axes[1].set_ylabel("Loss", fontsize=12) - axes[1].legend(title=group_var, bbox_to_anchor=(1.05, 1), loc='upper left') - + axes[1].legend(title=group_var, bbox_to_anchor=(1.05, 1), loc="upper left") + plt.tight_layout() plt.show() - + # Print summary statistics if flow_8bit: print("\n==== Loss Comparison: 4-bit vs 8-bit ====\n") - + # Group by quantization and compute means - summary = loss_df.groupby('Quantization').agg({ - 'Training Loss': ['mean', 'std'], - 'Eval Loss': ['mean', 'std'] - }) - - print(f"Training Loss (4-bit): {summary.loc['4bit', ('Training Loss', 'mean')]:.4f} ± {summary.loc['4bit', ('Training Loss', 'std')]:.4f}") - print(f"Training Loss (8-bit): {summary.loc['8bit', ('Training Loss', 'mean')]:.4f} ± {summary.loc['8bit', ('Training Loss', 'std')]:.4f}") - print(f"\nEval Loss (4-bit): {summary.loc['4bit', ('Eval Loss', 'mean')]:.4f} ± {summary.loc['4bit', ('Eval Loss', 'std')]:.4f}") - print(f"Eval Loss (8-bit): {summary.loc['8bit', ('Eval Loss', 'mean')]:.4f} ± {summary.loc['8bit', ('Eval Loss', 'std')]:.4f}") + summary = loss_df.groupby("Quantization").agg( + {"Training Loss": ["mean", "std"], "Eval Loss": ["mean", "std"]} + ) + + print( + f"Training Loss (4-bit): {summary.loc['4bit', ('Training Loss', 'mean')]:.4f} ± {summary.loc['4bit', ('Training Loss', 'std')]:.4f}" + ) + print( + f"Training Loss (8-bit): {summary.loc['8bit', ('Training Loss', 'mean')]:.4f} ± {summary.loc['8bit', ('Training Loss', 'std')]:.4f}" + ) + print( + f"\nEval Loss 
(4-bit): {summary.loc['4bit', ('Eval Loss', 'mean')]:.4f} ± {summary.loc['4bit', ('Eval Loss', 'std')]:.4f}" + ) + print( + f"Eval Loss (8-bit): {summary.loc['8bit', ('Eval Loss', 'mean')]:.4f} ± {summary.loc['8bit', ('Eval Loss', 'std')]:.4f}" + ) + def plot_aggregated_metrics(flow_4bit, flow_8bit): """Plot aggregated metrics comparing 4-bit and 8-bit quantization""" # Create a figure with subplots for aggregated metrics fig, axes = plt.subplots(1, 2, figsize=(16, 6)) - + # Helper function to safely convert tensor to float value def tensor_to_float(val): if val is None: @@ -373,7 +424,7 @@ def tensor_to_float(val): if isinstance(val, torch.Tensor): return val.detach().cpu().float().numpy().item() return val - + # Convert any tensor values to CPU before plotting loss_history_4bit = [tensor_to_float(x) for x in flow_4bit.average_loss_history] loss_history_8bit = [tensor_to_float(x) for x in flow_8bit.average_loss_history] @@ -381,51 +432,54 @@ def tensor_to_float(val): agg_model_loss_8bit = [tensor_to_float(x) for x in flow_8bit.agg_model_loss_history] local_model_loss_4bit = [tensor_to_float(x) for x in flow_4bit.local_model_loss_history] local_model_loss_8bit = [tensor_to_float(x) for x in flow_8bit.local_model_loss_history] - + # Setup data rounds = list(range(len(loss_history_4bit))) - + # Plot average loss history - axes[0].plot(rounds, loss_history_4bit, 'bo-', linewidth=2, markersize=8, label='4-bit') - axes[0].plot(rounds, loss_history_8bit, 'ro-', linewidth=2, markersize=8, label='8-bit') - axes[0].set_title('Average Training Loss by Round', fontsize=14, fontweight='bold') - axes[0].set_xlabel('Round', fontsize=12) - axes[0].set_ylabel('Loss', fontsize=12) + axes[0].plot(rounds, loss_history_4bit, "bo-", linewidth=2, markersize=8, label="4-bit") + axes[0].plot(rounds, loss_history_8bit, "ro-", linewidth=2, markersize=8, label="8-bit") + axes[0].set_title("Average Training Loss by Round", fontsize=14, fontweight="bold") + axes[0].set_xlabel("Round", 
fontsize=12) + axes[0].set_ylabel("Loss", fontsize=12) axes[0].grid(True, alpha=0.3) axes[0].legend(fontsize=10) - + # Plot final metrics comparison - metrics = ['Avg Training Loss', 'Agg Model Loss', 'Local Model Loss'] + metrics = ["Avg Training Loss", "Agg Model Loss", "Local Model Loss"] values_4bit = [loss_history_4bit[-1], agg_model_loss_4bit[-1], local_model_loss_4bit[-1]] values_8bit = [loss_history_8bit[-1], agg_model_loss_8bit[-1], local_model_loss_8bit[-1]] - + x = np.arange(len(metrics)) width = 0.35 - - bars1 = axes[1].bar(x - width/2, values_4bit, width, label='4-bit', color='blue', alpha=0.7) - bars2 = axes[1].bar(x + width/2, values_8bit, width, label='8-bit', color='red', alpha=0.7) - + + bars1 = axes[1].bar(x - width / 2, values_4bit, width, label="4-bit", color="blue", alpha=0.7) + bars2 = axes[1].bar(x + width / 2, values_8bit, width, label="8-bit", color="red", alpha=0.7) + # Add value labels on bars for bars in [bars1, bars2]: for bar in bars: height = bar.get_height() - axes[1].annotate(f'{height:.4f}', - xy=(bar.get_x() + bar.get_width() / 2, height), - xytext=(0, 3), - textcoords="offset points", - ha='center', va='bottom', - fontsize=9) - - axes[1].set_title('Final Metrics Comparison', fontsize=14, fontweight='bold') - axes[1].set_ylabel('Loss', fontsize=12) + axes[1].annotate( + f"{height:.4f}", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha="center", + va="bottom", + fontsize=9, + ) + + axes[1].set_title("Final Metrics Comparison", fontsize=14, fontweight="bold") + axes[1].set_ylabel("Loss", fontsize=12) axes[1].set_xticks(x) axes[1].set_xticklabels(metrics, rotation=15) - axes[1].legend(loc='upper right', fontsize=10) + axes[1].legend(loc="upper right", fontsize=10) axes[1].grid(True, alpha=0.3) - + plt.tight_layout() plt.show() - + # Print percent differences print("\n==== Percentage Difference (8-bit vs 4-bit) ====\n") for i, metric in enumerate(metrics): From 
b1f48fcfd445d0511a525cbc5ee0bdfcc9b8cb6b Mon Sep 17 00:00:00 2001 From: Rajith Date: Wed, 21 May 2025 17:14:58 +0530 Subject: [PATCH 34/34] renamed file name --- ...phi-4-withquantization.ipynb => phi-4-peft-quantization.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename openfl-tutorials/experimental/workflow/LLM/{phi-4-withquantization.ipynb => phi-4-peft-quantization.ipynb} (100%) diff --git a/openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb b/openfl-tutorials/experimental/workflow/LLM/phi-4-peft-quantization.ipynb similarity index 100% rename from openfl-tutorials/experimental/workflow/LLM/phi-4-withquantization.ipynb rename to openfl-tutorials/experimental/workflow/LLM/phi-4-peft-quantization.ipynb