Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
294 changes: 242 additions & 52 deletions classad/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,52 @@ package classad

import (
"bufio"
"fmt"
"io"
"iter"
"strings"
"unicode/utf8"
)

const (
// maxBufferSize limits the buffer size to prevent unbounded memory growth
// when processing malformed or very large inputs
maxBufferSize = 10 * 1024 * 1024 // 10MB
// readChunkSize is the size of chunks read from the io.Reader
readChunkSize = 4096 // 4KB
)

// Reader provides an iterator for parsing multiple ClassAds from an io.Reader.
// It supports both new-style (bracketed) and old-style (newline-delimited) formats.
type Reader struct {
scanner *bufio.Scanner
oldStyle bool
err error
current *ClassAd
// For new-style (bracketed) ClassAds
bufReader *bufio.Reader
buffer strings.Builder
oldStyle bool
scanner *bufio.Scanner
err error
current *ClassAd
}

// NewReader creates a new Reader for parsing new-style ClassAds (with brackets).
// Each ClassAd should be on its own, separated by whitespace or comments.
// This function natively supports concatenated ClassAds (e.g., "][") through
// grammar-level parsing.
// Example format:
//
// [Foo = 1; Bar = 2]
// [Baz = 3; Qux = 4]
//
// Also supports concatenated format:
//
// [Foo = 1; Bar = 2][Baz = 3; Qux = 4]
//
// This implementation streams data from the io.Reader, processing ClassAds
// one at a time without buffering the entire input in memory.
func NewReader(r io.Reader) *Reader {
return &Reader{
scanner: bufio.NewScanner(r),
oldStyle: false,
bufReader: bufio.NewReader(r),
oldStyle: false,
}
}

Expand Down Expand Up @@ -59,71 +81,239 @@ func (r *Reader) Next() bool {
return r.nextNew()
}

// nextNew reads the next new-style ClassAd (with brackets)
// nextNew reads the next new-style ClassAd by streaming from the reader.
// It tracks bracket depth to detect complete ClassAds, handling strings
// and comments properly.
func (r *Reader) nextNew() bool {
var lines []string
inClassAd := false
bracketDepth := 0
// Read data incrementally until we have a complete ClassAd
for {
// Check buffer size limit before expensive scan to prevent unbounded growth
if r.buffer.Len() > maxBufferSize {
r.err = fmt.Errorf("buffer exceeded maximum size (%d bytes): input may be malformed or too large", maxBufferSize)
return false
}

for r.scanner.Scan() {
line := strings.TrimSpace(r.scanner.Text())
// Check if we already have a complete ClassAd in the buffer
adStr, remaining, found := r.findCompleteClassAd()
if found {
// Parse the complete ClassAd
ad, err := Parse(adStr)
if err != nil {
r.err = err
return false
}
r.current = ad
// Update buffer with remaining data
r.buffer.Reset()
r.buffer.WriteString(remaining)
return true
}

// Skip empty lines and comments outside of ClassAds
if !inClassAd && (line == "" || strings.HasPrefix(line, "//") || strings.HasPrefix(line, "/*")) {
continue
// Need more data - read a chunk
chunk := make([]byte, readChunkSize)
n, err := r.bufReader.Read(chunk)
if n > 0 {
r.buffer.Write(chunk[:n])
// Check buffer size after writing to catch cases where a single chunk exceeds limit
if r.buffer.Len() > maxBufferSize {
r.err = fmt.Errorf("buffer exceeded maximum size (%d bytes): input may be malformed or too large", maxBufferSize)
return false
}
}
if err == io.EOF {
return r.handleEOF()
}
if err != nil {
r.err = err
return false
}
}
}

// Check if this line starts a ClassAd
if !inClassAd && strings.HasPrefix(line, "[") {
inClassAd = true
// handleEOF processes remaining data when EOF is reached.
// It attempts to parse any complete ClassAd or remaining data in the buffer.
func (r *Reader) handleEOF() bool {
// Check if we have a complete ClassAd in buffer
adStr, remaining, found := r.findCompleteClassAd()
if found {
ad, parseErr := Parse(adStr)
if parseErr != nil {
r.err = parseErr
return false
}
r.current = ad
r.buffer.Reset()
r.buffer.WriteString(remaining)
return true
}
// Check if there's any remaining data that might be a ClassAd
remainingStr := strings.TrimSpace(r.buffer.String())
if remainingStr != "" {
// Try to parse what's left
ad, parseErr := Parse(remainingStr)
if parseErr != nil {
r.err = parseErr
return false
}
r.current = ad
r.buffer.Reset()
return true
}
return false
}

if inClassAd {
lines = append(lines, line)
// findCompleteClassAd scans the buffer to find a complete ClassAd (balanced brackets).
// It returns the ClassAd string, any remaining data, and whether a complete ClassAd was found.
// This handles strings and comments properly so brackets inside them don't affect depth.
// The function uses byte-level iteration for efficiency, properly handling UTF-8 sequences.
func (r *Reader) findCompleteClassAd() (classAdStr, remaining string, found bool) {
bufStr := r.buffer.String()
if bufStr == "" {
return "", "", false
}

// Count brackets to handle nested ClassAds
for _, ch := range line {
switch ch {
case '[':
bracketDepth++
case ']':
bracketDepth--
// Track bracket depth, handling strings and comments
depth := 0
inString := false
inLineComment := false
inBlockComment := false
escapeNext := false
startPos := -1

// Skip leading whitespace and comments to find the start of a ClassAd
skipWhitespace := true

// Use byte-level iteration for efficiency (brackets are ASCII, single-byte)
// But properly handle UTF-8 sequences when advancing
for i := 0; i < len(bufStr); {
ch := bufStr[i]

// Skip whitespace before finding the first bracket
if skipWhitespace {
if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
i++
continue
}
// Check for line comment
if i+1 < len(bufStr) && ch == '/' && bufStr[i+1] == '/' {
// Skip to end of line
for i < len(bufStr) && bufStr[i] != '\n' {
i++
}
continue
}
// Check for block comment
if i+1 < len(bufStr) && ch == '/' && bufStr[i+1] == '*' {
// Skip block comment
i += 2
for i+1 < len(bufStr) {
if bufStr[i] == '*' && bufStr[i+1] == '/' {
i += 2
break
}
// Advance by rune to handle UTF-8 in comments
_, size := utf8.DecodeRuneInString(bufStr[i:])
if size == 0 {
break
}
i += size
}
continue
}
skipWhitespace = false
}

// If we've closed all brackets, we have a complete ClassAd
if bracketDepth == 0 {
classAdStr := strings.Join(lines, "\n")
ad, err := Parse(classAdStr)
if err != nil {
r.err = err
return false
// Handle escape sequences in strings
if escapeNext {
escapeNext = false
// Advance by rune to handle UTF-8 escape sequences properly
_, size := utf8.DecodeRuneInString(bufStr[i:])
if size == 0 {
break
}
i += size
continue
}

// Handle escape sequences in strings
if inString && ch == '\\' {
escapeNext = true
i++
continue
}

// Handle strings
if !inLineComment && !inBlockComment {
if ch == '"' {
inString = !inString
i++
continue
}
}

// Only process brackets when not in string or comment
// Brackets are ASCII (single-byte), so byte-level comparison is safe
if !inString && !inLineComment && !inBlockComment {
switch ch {
case '[':
if depth == 0 {
startPos = i
}
r.current = ad
return true
depth++
i++
case ']':
depth--
if depth == 0 && startPos >= 0 {
// Found complete ClassAd
classAdStr = bufStr[startPos : i+1]
remaining = strings.TrimSpace(bufStr[i+1:])
return classAdStr, remaining, true
}
i++
default:
// Not a bracket - advance by rune for UTF-8 handling
_, size := utf8.DecodeRuneInString(bufStr[i:])
if size == 0 {
break
}
i += size
}
continue
}
}

// Check for scanner errors
if err := r.scanner.Err(); err != nil {
r.err = err
return false
}
// Handle comments (only when not in string)
if !inString {
if !inBlockComment && i+1 < len(bufStr) && ch == '/' && bufStr[i+1] == '/' {
inLineComment = true
i += 2
continue
}
if inLineComment && ch == '\n' {
inLineComment = false
i++
continue
}
if !inLineComment && !inBlockComment && i+1 < len(bufStr) && ch == '/' && bufStr[i+1] == '*' {
inBlockComment = true
i += 2
continue
}
if inBlockComment && i+1 < len(bufStr) && ch == '*' && bufStr[i+1] == '/' {
inBlockComment = false
i += 2
continue
}
}

// If we have accumulated lines but hit EOF, try to parse them
if len(lines) > 0 {
classAdStr := strings.Join(lines, "\n")
ad, err := Parse(classAdStr)
if err != nil {
r.err = err
return false
// Advance by rune to handle UTF-8 properly in strings and comments
_, size := utf8.DecodeRuneInString(bufStr[i:])
if size == 0 {
break
}
r.current = ad
return true
i += size
}

return false
return "", "", false
}

// nextOld reads the next old-style ClassAd (newline-delimited, separated by blank lines)
Expand Down
Loading
Loading