Skip to content

Unit tests that aim to verify the behavior and correctness of the sampling pipeline under various conditions #1107

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 35 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
133d979
Added Unit tests for the Sampling Pipeline
csharpconsultant Feb 17, 2025
68e989c
Added Unit test for seed initialization on the default sampling pipeline
csharpconsultant Feb 17, 2025
b1347ca
Merge pull request #1 from ppaing2002/TonyDev
csharpconsultant Feb 17, 2025
34e7693
Added Unit test for seed initialization on Default sampling pipeline …
csharpconsultant Feb 17, 2025
29c1b3e
Merge branch 'SciSharp:master' into master
csharpconsultant Feb 17, 2025
3373304
Added Unit test MinKeep Setter for the Default Sampling Pipeline
csharpconsultant Feb 18, 2025
badd399
Merge pull request #2 from ppaing2002/TonyDev
csharpconsultant Feb 18, 2025
2eaa1fd
Added Unit test MinKeep Test Default Get Default Sampling Pipeline
csharpconsultant Feb 18, 2025
0d544fe
Merge branch 'master' into TonyDev
csharpconsultant Feb 18, 2025
0727738
Merge branch 'SciSharp:master' into master
csharpconsultant Feb 18, 2025
dc43d80
Merge branch 'master' into TonyDev
csharpconsultant Feb 18, 2025
10c2322
Merge pull request #3 from ppaing2002/TonyDev
csharpconsultant Feb 18, 2025
43d0b15
Merge branch 'master' into TonyDev
csharpconsultant Feb 18, 2025
4dc4ced
Sampling tests
csharpconsultant Feb 21, 2025
a5e58a8
Merge pull request #4 from ppaing2002/TonyDev
csharpconsultant Feb 21, 2025
9b16ab8
Merge branch 'SciSharp:master' into master
csharpconsultant Feb 21, 2025
e956141
Addressed reviewer comments and updated with new changes
csharpconsultant Feb 21, 2025
703a2f4
Merge pull request #5 from ppaing2002/TonyDev
csharpconsultant Feb 21, 2025
38e30a7
Sampling tests
csharpconsultant Feb 21, 2025
a8cdcbc
Addressed reviewer comments and updated with new changes
csharpconsultant Feb 21, 2025
6c16786
InteractiveExecutor now stops at EOS tokens again
dpmm99 Feb 10, 2025
a268c29
Merge branch 'SciSharp:master' into TonyDev
csharpconsultant Feb 21, 2025
0fd8058
Removed sampling with temperature test
csharpconsultant Feb 21, 2025
17982a9
Merge branch 'TonyDev' of https://github.com/ppaing2002/LLamaSharp-EC…
csharpconsultant Feb 21, 2025
457161f
Resolved merge conflict in LLama.Unittest/SamplingTests.cs
csharpconsultant Feb 21, 2025
151d985
Merge pull request #6 from ppaing2002/TonyDev
csharpconsultant Feb 21, 2025
30727ea
Merge branch 'SciSharp:master' into unitPhone
ppaing2002 Feb 23, 2025
1d423b1
Adding LLama.Rag project to solution for implementation of RAG
seth2396 Feb 23, 2025
cacecdd
Merge pull request #7 from ppaing2002/SethDev
csharpconsultant Feb 23, 2025
63adef2
Created an interface IWebScraper and implemented it to make web scrap…
csharpconsultant Feb 24, 2025
5710e03
Merge pull request #8 from ppaing2002/TonyDev
seth2396 Feb 27, 2025
0ad7503
unit_tests
ppaing2002 Mar 17, 2025
a2dff14
unit test for project
adi725e Apr 19, 2025
8575a99
Merge pull request #9 from ppaing2002/adi_test1
ppaing2002 May 9, 2025
41c81bc
Merge pull request #10 from ppaing2002/unitPhone
ppaing2002 May 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions LLama.Rag/IWebScraper.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
using System.Collections.Generic;
using System.Threading.Tasks;
using HtmlAgilityPack;

namespace LLama.Rag
{
/// <summary>
/// Contract for a scraper that holds a set of downloaded HTML documents and
/// exposes text-extraction operations over them.
/// </summary>
public interface IWebScraper
{
/// <summary>URLs the scraper has already attempted, used to avoid fetching the same URL twice.</summary>
HashSet<string> VisitedUrls { get; }
/// <summary>Parsed HTML documents collected by the scraper.</summary>
List<HtmlDocument> Documents { get; }

/// <summary>
/// Extracts text blocks from the collected documents.
/// </summary>
/// <param name="minWordLength">
/// Minimum number of words a block must contain to be kept (the WebScraper implementation
/// counts space-separated tokens).
/// </param>
/// <param name="checkSentences">When true, the result is additionally filtered by a sentence check.</param>
/// <param name="explodeParagraphs">When true, blocks are split into individual sentences.</param>
/// <returns>A task producing the extracted text blocks.</returns>
Task<List<string>> ExtractVisibleTextAsync(int minWordLength, bool checkSentences, bool explodeParagraphs);
/// <summary>
/// Extracts paragraph text from the collected documents.
/// </summary>
/// <returns>A task producing the extracted paragraph text fragments.</returns>
Task<List<string>> ExtractParagraphsAsync();
}
}
14 changes: 14 additions & 0 deletions LLama.Rag/LLama.Rag.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Build settings for the LLama.Rag project. -->
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<!-- Common System.* namespaces are imported implicitly, so source files may omit some using directives. -->
<ImplicitUsings>enable</ImplicitUsings>
<!-- Nullable reference types are on: reference types that may be null should be annotated with '?'. -->
<Nullable>enable</Nullable>
<!-- Built as an executable; the entry point is Rag.Main. -->
<OutputType>Exe</OutputType>
</PropertyGroup>

<ItemGroup>
<!-- HTML parsing library used by WebScraper (HtmlDocument, HtmlEntity, XPath node selection). -->
<PackageReference Include="HtmlAgilityPack" Version="1.11.72" />
</ItemGroup>

</Project>
53 changes: 53 additions & 0 deletions LLama.Rag/Rag.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
using System;
using System.Threading.Tasks;

namespace LLama.Rag
{
/// <summary>
/// Console entry point that scrapes a single hard-coded web page with
/// <c>WebScraper</c> and prints the extracted text blocks.
/// </summary>
public class Rag
{
    /// <summary>
    /// Creates a <c>WebScraper</c> for the start URL, extracts its visible text and
    /// writes every extracted block to the console. Any exception is reported on the
    /// console instead of being propagated.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static async Task Main(string[] args)
    {
        try
        {
            Console.WriteLine("Initializing WebScraper...");

            string startUrl = "https://en.wikipedia.org/wiki/Aluminium_alloy";
            int depth = 0; // Scrape only the provided webpage and no links.
            int minWordLength = 4; // Minimum word count for a text block to be extracted.
            bool checkSentences = false;
            bool explodeParagraphs = true;

            WebScraper webScraper = await WebScraper.CreateAsync(startUrl, depth);

            Console.WriteLine("WebScraper initialized successfully.");
            Console.WriteLine("Extracting visible text...");

            // Await the extraction instead of blocking on Task.Result: blocking inside an
            // async method ties up a thread and wraps failures in AggregateException.
            var documentText = await webScraper.ExtractVisibleTextAsync(minWordLength, checkSentences, explodeParagraphs);

            Console.WriteLine($"Extracted {documentText.Count} blocks of text.");

            if (documentText.Count == 0)
            {
                Console.WriteLine("Warning: No text was extracted. Try lowering minWordLength or changing extraction settings.");
            }

            foreach (string text in documentText)
            {
                Console.WriteLine("Extracted Block:");
                Console.WriteLine(text);
                Console.WriteLine(""); // Space between blocks for readability
            }

            Console.WriteLine("Scraping complete.");
        }
        catch (Exception ex)
        {
            // Top-level handler for the demo: report the failure and fall through to the exit prompt.
            Console.WriteLine($"An error occurred: {ex.Message}");
            Console.WriteLine($"StackTrace: {ex.StackTrace}");
        }

        Console.WriteLine("Press any key to exit...");
        Console.ReadKey();
    }
}
}
144 changes: 144 additions & 0 deletions LLama.Rag/WebScraper.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using System.Web;

namespace LLama.Rag
{
/// <summary>
/// Downloads a start page and, up to a given link depth, the pages it links to,
/// keeps the parsed HTML documents, and offers text-extraction helpers over them.
/// Instances are created through <see cref="CreateAsync"/>.
/// </summary>
class WebScraper : IWebScraper
{
    // One shared client for all requests; creating an HttpClient per request is wasteful.
    private static readonly HttpClient httpClient = new HttpClient();

    // Built once: constructing RegexOptions.Compiled regexes on every call is expensive.
    private static readonly Regex[] SentenceRules =
    {
        new Regex(@"^[A-Za-z0-9]+[\w\s,;:'""-]*", RegexOptions.Compiled | RegexOptions.IgnoreCase),
        new Regex(@"[^\W]{2,}", RegexOptions.Compiled),
        new Regex(@"\b(\w*:?[/\w\d]+\.){2,}\d+\b", RegexOptions.Compiled)
    };

    // Guards VisitedUrls and Documents: linked pages are fetched concurrently via
    // Task.WhenAll and neither HashSet<T> nor List<T> is safe for concurrent writes.
    private readonly object _gate = new object();

    /// <summary>URLs that have been attempted (successfully or not) during crawling.</summary>
    public HashSet<string> VisitedUrls { get; } = new HashSet<string>();

    /// <summary>Parsed documents of every page that was downloaded successfully.</summary>
    public List<HtmlDocument> Documents { get; } = new List<HtmlDocument>();

    private WebScraper() { }

    /// <summary>
    /// Creates a scraper and crawls starting at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Page to download first.</param>
    /// <param name="queryDepth">
    /// How many levels of links to follow; 0 downloads only <paramref name="url"/>,
    /// a negative value downloads nothing.
    /// </param>
    /// <returns>A scraper whose <see cref="Documents"/> holds the downloaded pages.</returns>
    public static async Task<WebScraper> CreateAsync(string url, int queryDepth)
    {
        WebScraper instance = new WebScraper();
        await instance.FetchContentAsync(url, queryDepth);
        return instance;
    }

    /// <summary>
    /// Downloads <paramref name="url"/>, stores the parsed document and, while
    /// <paramref name="queryDepth"/> is positive, recursively fetches the page's links.
    /// Failures are logged to the console and do not abort the crawl.
    /// </summary>
    private async Task FetchContentAsync(string url, int queryDepth)
    {
        if (queryDepth < 0) return;

        // HashSet.Add reports whether the URL was new, so the visited check and the
        // insertion happen atomically under the lock and no URL is fetched twice.
        lock (_gate)
        {
            if (!VisitedUrls.Add(url)) return;
        }

        try
        {
            string pageContent = await httpClient.GetStringAsync(url);
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(pageContent);

            lock (_gate)
            {
                Documents.Add(doc);
            }

            if (queryDepth > 0)
            {
                var links = ExtractLinks(doc, url);
                var tasks = links.Select(link => FetchContentAsync(link, queryDepth - 1));
                await Task.WhenAll(tasks);
            }
        }
        catch (Exception ex)
        {
            // Best effort: one unreachable or malformed page must not stop the crawl.
            Console.WriteLine($"Error scraping {url}: {ex.Message}");
        }
    }

    /// <summary>
    /// Collects the distinct, normalized targets of all anchors with an href inside the body.
    /// </summary>
    /// <returns>The list of links; empty when the document has no matching anchors.</returns>
    private static List<string> ExtractLinks(HtmlDocument doc, string baseUrl)
    {
        return doc.DocumentNode
            .SelectNodes("//body//a[@href]")?
            .Select(node => node.GetAttributeValue("href", ""))
            .Where(href => !string.IsNullOrEmpty(href))
            .Select(href => NormalizeUrl(href, baseUrl))
            .OfType<string>() // drops the nulls returned for unsupported hrefs
            .Distinct()
            .ToList() ?? new List<string>();
    }

    /// <summary>
    /// Turns an href into an absolute URL.
    /// </summary>
    /// <returns>
    /// The href itself when it starts with "http" (case-insensitive); the href resolved
    /// against <paramref name="baseUrl"/> when it starts with "/"; otherwise null
    /// (other relative forms, fragments, or hrefs that cannot be resolved).
    /// </returns>
    private static string? NormalizeUrl(string href, string baseUrl)
    {
        if (href.StartsWith("http", StringComparison.OrdinalIgnoreCase))
            return href;

        // TryCreate instead of the Uri constructor: a single malformed href should be
        // skipped rather than throw and discard every link on the page.
        if (href.StartsWith("/", StringComparison.Ordinal)
            && Uri.TryCreate(baseUrl, UriKind.Absolute, out Uri? baseUri)
            && Uri.TryCreate(baseUri, href, out Uri? absolute))
        {
            return absolute.ToString();
        }

        return null;
    }

    /// <summary>
    /// Extracts whitespace-normalized text blocks from every collected document.
    /// </summary>
    /// <param name="minWordLength">Minimum number of space-separated tokens a block must have.</param>
    /// <param name="checkSentences">When true, the result is filtered through the sentence rules.</param>
    /// <param name="explodeParagraphs">When true, blocks are split into individual sentences.</param>
    /// <returns>The extracted text blocks.</returns>
    public async Task<List<string>> ExtractVisibleTextAsync(int minWordLength, bool checkSentences, bool explodeParagraphs)
    {
        return await Task.Run(() =>
        {
            List<string> allDocumentText = new List<string>();
            foreach (HtmlDocument doc in Documents)
            {
                // Selects every element under body that is neither inside a table nor a
                // script/style element, plus every anchor under body.
                // NOTE(review): parents and their children are both selected, so the same
                // text can presumably show up in several blocks - confirm this is intended.
                var currentDocText = doc.DocumentNode
                    .SelectNodes("//body//*[not(ancestor::table) and not(self::script or self::style)] | //body//a[not(self::script or self::style)]")?
                    .Select(node =>
                    {
                        string cleanedText = HtmlEntity.DeEntitize(node.InnerText.Trim());
                        cleanedText = cleanedText.Replace("\t", " ");
                        cleanedText = Regex.Replace(cleanedText, @"\s+", " ");
                        return cleanedText;
                    })
                    .Where(text => !string.IsNullOrWhiteSpace(text) && text.Split(' ').Length >= minWordLength)
                    .ToList() ?? new List<string>();

                allDocumentText.AddRange(currentDocText);
            }

            if (explodeParagraphs) allDocumentText = ExplodeParagraphs(allDocumentText, minWordLength);
            if (checkSentences) allDocumentText = RudimentarySentenceCheck(allDocumentText);
            return allDocumentText;
        });
    }

    /// <summary>
    /// Extracts the non-blank text nodes found inside paragraph elements of every collected document.
    /// </summary>
    /// <returns>The trimmed, entity-decoded text fragments.</returns>
    public async Task<List<string>> ExtractParagraphsAsync()
    {
        return await Task.Run(() =>
        {
            List<string> paragraphs = new List<string>();
            foreach (HtmlDocument doc in Documents)
            {
                var currentDocParagraph = doc.DocumentNode
                    .SelectNodes("//p//text()")?
                    .Select(node => HtmlEntity.DeEntitize(node.InnerText.Trim()))
                    .Where(text => !string.IsNullOrWhiteSpace(text))
                    .ToList() ?? new List<string>();

                paragraphs.AddRange(currentDocParagraph);
            }
            return paragraphs;
        });
    }

    /// <summary>
    /// Keeps only the strings that match every pattern in <see cref="SentenceRules"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the third rule only matches text containing a dotted token sequence
    /// ending in digits; requiring it for every sentence looks unintended - confirm.
    /// </remarks>
    private static List<string> RudimentarySentenceCheck(List<string> sentences)
    {
        return sentences.Where(sentence => SentenceRules.All(regex => regex.IsMatch(sentence))).ToList();
    }

    /// <summary>
    /// Splits each paragraph into the sentences matched by the sentence pattern; text that
    /// the pattern does not match is dropped.
    /// </summary>
    /// <remarks>
    /// NOTE(review): <paramref name="minWordLength"/> is currently not applied to the
    /// resulting sentences - confirm whether short sentences should be filtered here.
    /// </remarks>
    private static List<string> ExplodeParagraphs(List<string> paragraphs, int minWordLength)
    {
        return paragraphs
            .SelectMany(paragraph =>
                Regex.Matches(paragraph, @"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\s|^)([A-Z0-9][^.!?]*[.!?])")
                    .Cast<Match>()
                    .Select(m => m.Value.Trim()))
            .ToList();
    }
}
}
31 changes: 31 additions & 0 deletions LLama.Unittest/ChatSessionTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
using LLama.Common;
using Xunit.Abstractions;

namespace LLama.Unittest
{
/// <summary>
/// Test fixture scaffolding for chat-session tests: the constructor loads model
/// weights from <c>Constants.GenerativeModelPath2</c> and <see cref="Dispose"/>
/// releases them. No test methods are defined in this class yet.
/// </summary>
public sealed class ChatSessionTests
    : IDisposable
{
    private readonly ITestOutputHelper _testOutputHelper;
    private readonly ModelParams _params;
    private readonly LLamaWeights _model;

    /// <summary>
    /// Stores the output helper and loads the model with a 128-token context and the
    /// GPU layer count configured in <c>Constants.CIGpuLayerCount</c>.
    /// </summary>
    /// <param name="testOutputHelper">xUnit output sink supplied by the test runner.</param>
    public ChatSessionTests(ITestOutputHelper testOutputHelper)
    {
        _testOutputHelper = testOutputHelper;

        _params = new(Constants.GenerativeModelPath2)
        {
            ContextSize = 128,
            GpuLayerCount = Constants.CIGpuLayerCount
        };

        _model = LLamaWeights.LoadFromFile(_params);
    }

    /// <summary>Releases the loaded model weights.</summary>
    public void Dispose() => _model.Dispose();
}
}
63 changes: 63 additions & 0 deletions LLama.Unittest/GrammarTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
using Xunit;

namespace LLama.Sampling.Tests
{
/// <summary>
/// Unit tests for <c>Grammar</c>: constructor/property wiring, string
/// representation, and equality semantics.
/// </summary>
public class GrammarTests
{
    // Shared fixture values used by every test.
    private const string SampleGbnf = "test_gbnf";
    private const string SampleRoot = "test_root";

    /// <summary>The constructor arguments are exposed through Gbnf and Root.</summary>
    [Fact]
    public void Constructor_SetsPropertiesCorrectly()
    {
        // Act
        var sut = new Grammar(SampleGbnf, SampleRoot);

        // Assert
        Assert.Equal(SampleGbnf, sut.Gbnf);
        Assert.Equal(SampleRoot, sut.Root);
    }

    /// <summary>ToString renders both properties in the expected format.</summary>
    [Fact]
    public void ToString_ReturnsExpectedString()
    {
        // Arrange
        var sut = new Grammar(SampleGbnf, SampleRoot);
        string expected = $"Grammar {{ Gbnf = {SampleGbnf}, Root = {SampleRoot} }}";

        // Act
        string actual = sut.ToString();

        // Assert
        Assert.Equal(expected, actual);
    }

    /// <summary>Two instances built from the same values compare equal.</summary>
    [Fact]
    public void Equality_ChecksPropertiesCorrectly()
    {
        // Arrange
        var first = new Grammar(SampleGbnf, SampleRoot);
        var second = new Grammar(SampleGbnf, SampleRoot);

        // Act and Assert
        Assert.Equal(first, second);
    }

    /// <summary>Instances that differ in Gbnf compare unequal.</summary>
    [Fact]
    public void Inequality_ChecksPropertiesCorrectly()
    {
        // Arrange
        var first = new Grammar(SampleGbnf, SampleRoot);
        var second = new Grammar("different_gbnf", SampleRoot);

        // Act and Assert
        Assert.NotEqual(first, second);
    }
}
}
Loading