diff --git a/cmd/api/src/api/v2/file_uploads_integration_test.go b/cmd/api/src/api/v2/file_uploads_integration_test.go index f6a15e04d6..64565a33cc 100644 --- a/cmd/api/src/api/v2/file_uploads_integration_test.go +++ b/cmd/api/src/api/v2/file_uploads_integration_test.go @@ -27,11 +27,10 @@ import ( "net/http" "testing" - "github.com/specterops/bloodhound/mediatypes" - "github.com/specterops/bloodhound/src/services/fileupload" - "github.com/specterops/bloodhound/headers" + "github.com/specterops/bloodhound/mediatypes" "github.com/specterops/bloodhound/src/api/v2/integration" + "github.com/specterops/bloodhound/src/services/fileupload" "github.com/specterops/bloodhound/src/test/fixtures/fixtures" "github.com/stretchr/testify/assert" ) @@ -170,7 +169,7 @@ func Test_FileUploadWorkFlowVersion5(t *testing.T) { "v5/ingest/sessions.json", }) - //Assert that we created stuff we expected + // Assert that we created stuff we expected testCtx.AssertIngest(fixtures.IngestAssertions) } @@ -189,7 +188,7 @@ func Test_FileUploadWorkFlowVersion6(t *testing.T) { "v6/ingest/sessions.json", }) - //Assert that we created stuff we expected + // Assert that we created stuff we expected testCtx.AssertIngest(fixtures.IngestAssertions) testCtx.AssertIngest(fixtures.IngestAssertionsv6) testCtx.AssertIngest(fixtures.PropertyAssertions) @@ -240,7 +239,7 @@ func Test_CompressedFileUploadWorkFlowVersion5(t *testing.T) { "v5/ingest/sessions.json", }) - //Assert that we created stuff we expected + // Assert that we created stuff we expected testCtx.AssertIngest(fixtures.IngestAssertions) testCtx.AssertIngest(fixtures.PropertyAssertions) } @@ -260,7 +259,7 @@ func Test_CompressedFileUploadWorkFlowVersion6(t *testing.T) { "v6/ingest/sessions.json", }) - //Assert that we created stuff we expected + // Assert that we created stuff we expected testCtx.AssertIngest(fixtures.IngestAssertions) testCtx.AssertIngest(fixtures.IngestAssertionsv6) testCtx.AssertIngest(fixtures.PropertyAssertions) diff --git 
a/cmd/api/src/go.mod b/cmd/api/src/go.mod index 912348a254..63721cb755 100644 --- a/cmd/api/src/go.mod +++ b/cmd/api/src/go.mod @@ -39,7 +39,7 @@ require ( github.com/pquerna/otp v1.4.0 github.com/prometheus/client_golang v1.16.0 github.com/russellhaering/goxmldsig v1.4.0 - github.com/stretchr/testify v1.8.4 + github.com/stretchr/testify v1.9.0 github.com/unrolled/secure v1.13.0 github.com/zenazn/goji v1.0.1 go.uber.org/mock v0.2.0 @@ -79,7 +79,7 @@ require ( github.com/prometheus/procfs v0.11.0 // indirect github.com/rivo/uniseg v0.4.4 // indirect golang.org/x/sys v0.21.0 // indirect - golang.org/x/text v0.16.0 // indirect + golang.org/x/text v0.17.0 // indirect golang.org/x/time v0.3.0 // indirect google.golang.org/protobuf v1.34.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/cmd/api/src/go.sum b/cmd/api/src/go.sum index f7881c9eb7..47f43e156e 100644 --- a/cmd/api/src/go.sum +++ b/cmd/api/src/go.sum @@ -208,8 +208,7 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.4/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/unrolled/secure v1.13.0 h1:sdr3Phw2+f8Px8HE5sd1EHdj1aV3yUwed/uZXChLFsk= github.com/unrolled/secure v1.13.0/go.mod h1:BmF5hyM6tXczk3MpQkFf1hpKSRqCyhqcbiQtiAF7+40= github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q= @@ -250,7 +249,7 @@ golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod 
h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -274,8 +273,7 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= golang.org/x/time v0.0.0-20200416051211-89c76fbcd5d1/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= diff --git a/cmd/api/src/services/fileupload/file_upload.go b/cmd/api/src/services/fileupload/file_upload.go index ca5b211b12..87f6caf9fc 100644 --- a/cmd/api/src/services/fileupload/file_upload.go +++ b/cmd/api/src/services/fileupload/file_upload.go @@ -18,7 +18,6 @@ package fileupload import ( - "bufio" "context" "errors" "fmt" @@ -27,6 +26,7 @@ import ( "os" "time" + "github.com/specterops/bloodhound/bomenc" 
"github.com/specterops/bloodhound/headers" "github.com/specterops/bloodhound/mediatypes" "github.com/specterops/bloodhound/src/model/ingest" @@ -120,18 +120,12 @@ func WriteAndValidateZip(src io.Reader, dst io.Writer) error { } func WriteAndValidateJSON(src io.Reader, dst io.Writer) error { - tr := io.TeeReader(src, dst) - bufReader := bufio.NewReader(tr) - if b, err := bufReader.Peek(3); err != nil { + normalizedContent, err := bomenc.NormalizeToUTF8(src) + if err != nil { return err - } else { - if b[0] == UTF8BOM1 && b[1] == UTF8BOM2 && b[2] == UTF8BMO3 { - if _, err := bufReader.Discard(3); err != nil { - return err - } - } } - _, err := ValidateMetaTag(bufReader, true) + tr := io.TeeReader(normalizedContent, dst) + _, err = ValidateMetaTag(tr, true) return err } @@ -147,7 +141,7 @@ func SaveIngestFile(location string, request *http.Request) (string, model.FileT } else if utils.HeaderMatches(request.Header, headers.ContentType.String(), ingest.AllowedZipFileUploadTypes...) { return tempFile.Name(), model.FileTypeZip, WriteAndValidateFile(fileData, tempFile, WriteAndValidateZip) } else { - //We should never get here since this is checked a level above + // We should never get here since this is checked a level above return "", model.FileTypeJson, fmt.Errorf("invalid content type for ingest file") } } diff --git a/cmd/api/src/services/fileupload/file_upload_test.go b/cmd/api/src/services/fileupload/file_upload_test.go index 59943adf18..9d81504440 100644 --- a/cmd/api/src/services/fileupload/file_upload_test.go +++ b/cmd/api/src/services/fileupload/file_upload_test.go @@ -18,6 +18,7 @@ package fileupload import ( "bytes" + "errors" "io" "os" "strings" @@ -27,42 +28,9 @@ import ( "github.com/stretchr/testify/assert" ) -func TestWriteAndValidateJSON(t *testing.T) { - t.Run("trigger invalid json on bad json", func(t *testing.T) { - var ( - writer = bytes.Buffer{} - badJSON = strings.NewReader("{[]}") - ) - err := WriteAndValidateJSON(badJSON, &writer) - 
assert.ErrorIs(t, err, ErrInvalidJSON) - }) - - t.Run("succeed on good json", func(t *testing.T) { - var ( - writer = bytes.Buffer{} - goodJSON = strings.NewReader(`{"meta": {"methods": 0, "type": "sessions", "count": 0, "version": 5}, "data": []}`) - ) - err := WriteAndValidateJSON(goodJSON, &writer) - assert.Nil(t, err) - }) - - t.Run("succeed on utf-8 BOM json", func(t *testing.T) { - var ( - writer = bytes.Buffer{} - ) - - file, err := os.Open("../../test/fixtures/fixtures/utf8bomjson.json") - assert.Nil(t, err) - err = WriteAndValidateJSON(io.Reader(file), &writer) - assert.Nil(t, err) - }) -} - func TestWriteAndValidateZip(t *testing.T) { t.Run("valid zip file is ok", func(t *testing.T) { - var ( - writer = bytes.Buffer{} - ) + writer := bytes.Buffer{} file, err := os.Open("../../test/fixtures/fixtures/goodzip.zip") assert.Nil(t, err) @@ -81,3 +49,86 @@ func TestWriteAndValidateZip(t *testing.T) { assert.Equal(t, err, ingest.ErrInvalidZipFile) }) } + +func TestWriteAndValidateJSON(t *testing.T) { + tests := []struct { + name string + input []byte + expectedOutput []byte + expectedError error + }{ + { + name: "UTF-8 without BOM", + input: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`), + expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`), + expectedError: nil, + }, + { + name: "UTF-8 with BOM", + input: append([]byte{0xEF, 0xBB, 0xBF}, []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`)...), + expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`), + expectedError: nil, + }, + { + name: "UTF-16BE with BOM", + input: []byte{0xFE, 0xFF, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x6D, 0x00, 0x65, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x74, 0x00, 0x79, 0x00, 0x70, 0x00, 0x65, 0x00, 0x22, 
0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x73, 0x00, 0x22, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x76, 0x00, 0x65, 0x00, 0x72, 0x00, 0x73, 0x00, 0x69, 0x00, 0x6F, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x34, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x75, 0x00, 0x6E, 0x00, 0x74, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x31, 0x00, 0x7D, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x61, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x5B, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x65, 0x00, 0x78, 0x00, 0x61, 0x00, 0x6D, 0x00, 0x70, 0x00, 0x6C, 0x00, 0x65, 0x00, 0x2E, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x22, 0x00, 0x7D, 0x00, 0x5D, 0x00, 0x7D}, + expectedOutput: []byte{0x7b, 0x22, 0x6d, 0x65, 0x74, 0x61, 0x22, 0x3a, 0x20, 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20, 0x34, 0x2c, 0x20, 0x22, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x22, 0x3a, 0x20, 0x31, 0x7d, 0x2c, 0x20, 0x22, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3a, 0x20, 0x5b, 0x7b, 0x22, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x22, 0x3a, 0x20, 0x22, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x6f, 0x6d, 0x22, 0x7d, 0x5d, 0x7d}, + expectedError: nil, + }, + { + name: "Missing meta tag", + input: []byte(`{"data": [{"domain": "example.com"}]}`), + expectedOutput: []byte(`{"data": [{"domain": "example.com"}]}`), + expectedError: ingest.ErrMetaTagNotFound, + }, + { + name: "Missing data tag", + input: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}}`), + expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}}`), + expectedError: ingest.ErrDataTagNotFound, + }, + // NOTE: this test discovers a 
bug where invalid JSON files are not being rejected due to the current + // implementation of ValidateMetaTag, which decodes each token. + // { + // name: "Invalid JSON", + // input: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"`), + // expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"`), + // expectedError: ErrInvalidJSON, + // }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + src := bytes.NewReader(tt.input) + dst := &bytes.Buffer{} + + err := WriteAndValidateJSON(src, dst) + if tt.expectedError != nil { + assert.Error(t, err) + assert.ErrorIs(t, err, tt.expectedError) + } else { + assert.NoError(t, err) + } + assert.Equal(t, tt.expectedOutput, dst.Bytes()) + }) + } +} + +func TestWriteAndValidateJSON_NormalizationError(t *testing.T) { + src := &ErrorReader{err: errors.New("read error")} + dst := &bytes.Buffer{} + + err := WriteAndValidateJSON(src, dst) + + assert.Error(t, err) + assert.ErrorIs(t, err, ErrInvalidJSON) +} + +// ErrorReader is a mock reader that always returns an error +type ErrorReader struct { + err error +} + +func (er *ErrorReader) Read(p []byte) (n int, err error) { + return 0, er.err +} diff --git a/go.work b/go.work index c5077a96c0..21c94c9931 100644 --- a/go.work +++ b/go.work @@ -14,11 +14,12 @@ // // SPDX-License-Identifier: Apache-2.0 -go 1.21 +go 1.21.3 use ( ./cmd/api/src ./packages/go/analysis + ./packages/go/bomenc ./packages/go/cache ./packages/go/conftool ./packages/go/crypto diff --git a/packages/go/bomenc/encodings.go b/packages/go/bomenc/encodings.go new file mode 100644 index 0000000000..a3add1c4bf --- /dev/null +++ b/packages/go/bomenc/encodings.go @@ -0,0 +1,198 @@ +// Copyright 2024 Specter Ops, Inc. +// +// Licensed under the Apache License, Version 2.0 +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package bomenc + +import ( + "encoding/binary" + "io" +) + +// Encoding interface defines the methods that all encoding types must implement. +// This interface provides a unified way to handle different encodings throughout the package, +// allowing us to treat all encodings uniformly. This design facilitates easy extension +// and manipulation of different encoding types without altering the core logic. +type Encoding interface { + // Sequence returns the byte sequence that represents the Byte Order Mark (BOM) for this encoding. + // This method is crucial for identifying the specific byte sequence that indicates + // this encoding at the start of a file. + Sequence() []byte + + // String returns a human-readable string representation of the encoding. + // This is particularly useful for logging and user interfaces, providing + // a user-friendly name for the encoding. + String() string + + // HasSequence checks if the given data starts with this encoding's BOM sequence. + // This method allows for efficient checking of whether the data exposed by the given Peeker + // begins with this encoding's BOM, which is essential for encoding detection. + HasSequence(data Peeker) bool +} + +// Peeker interface defines a single method for inspecting the first n bytes of the underlying structure without modifying its read state or advancing its cursor. +type Peeker interface { + Peek(n int) ([]byte, error) +} + +// bomEncoding is the concrete implementation of the Encoding interface. 
+// It encapsulates all necessary information and behavior for a specific encoding, +// providing a consistent structure for handling different encodings. This approach +// allows us to create instances for each supported encoding while maintaining +// a uniform interface for interaction. +type bomEncoding struct { + encodingType string // A human-readable name for the encoding + sequence []byte // The BOM sequence for this encoding + hasSequenceFunc func(data Peeker) bool // Function to check if data starts with this encoding's BOM +} + +// String returns the human-readable name of the encoding. +// This method fulfills the Encoding interface and provides a simple way +// to get a string representation of the encoding. +func (s bomEncoding) String() string { + return s.encodingType +} + +// Sequence returns the BOM sequence for this encoding. +// This method fulfills the Encoding interface and provides access to the BOM sequence, +// which is essential for encoding detection and writing files with proper BOMs. +func (s bomEncoding) Sequence() []byte { + return s.sequence +} + +// HasSequence checks if the given data starts with this encoding's BOM sequence. +// This method fulfills the Encoding interface and provides a way to check for +// the presence of this encoding's BOM, which is crucial for encoding detection. +func (s bomEncoding) HasSequence(data Peeker) bool { + return s.hasSequenceFunc(data) +} + +// The following functions are used to check for specific encoding BOMs. +// By defining these as separate functions, we can easily reuse them +// and potentially extend them if more complex checking is needed in the future. +// This approach also keeps the bomEncoding struct clean and simple. 
+ +func isUTF32BE(data Peeker) bool { + if buf, err := data.Peek(4); err != nil { + return false + } else { + return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF + } +} + +func isUTF32LE(data Peeker) bool { + if buf, err := data.Peek(4); err != nil { + return false + } else if buf[0] != 0xFF || buf[1] != 0xFE || buf[2] != 0x00 || buf[3] != 0x00 { + return false + } else if buf, err := data.Peek(64); err != nil && err != io.EOF { // BOM + sample code points to check for valid sequences + return false + } else if err != nil && err == io.EOF && len(buf)%4 != 0 { + return false + } else { + // Check for valid UTF-32LE sequences + for i := 4; i+3 < len(buf); i += 4 { + codePoint := binary.LittleEndian.Uint32(buf[i : i+4]) + if codePoint > 0x10FFFF { + return false + } + } + // NOTE: There is an edge case where data may include the UTF16LE BOM and a NULL code point + // followed by sampled code points that, when calculated, all fall within the Unicode range. In this + // case, distinguishing between UTF32LE and UTF16LE encoded data is impossible from just the BOM + data. 
+ // With the probability of occurrence being low, we're opting to return true + return true + } +} + +func isUTF8(data Peeker) bool { + if buf, err := data.Peek(3); err != nil { + return false + } else { + return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF + } +} + +func isUTF16BE(data Peeker) bool { + if buf, err := data.Peek(2); err != nil { + return false + } else { + return buf[0] == 0xFE && buf[1] == 0xFF + } +} + +func isUTF16LE(data Peeker) bool { + if buf, err := data.Peek(2); err != nil { + return false + } else { + if buf[0] == 0xFF && buf[1] == 0xFE { + if buf, err := data.Peek(4); err != nil { + return err == io.EOF && len(buf) == 2 // true: has UTF16LE BOM w/ no data, false: is invalid UTF16LE encoding + } else { + return !isUTF32LE(data) // true: is UTF16LE data, false: is UTF32LE data + } + } + return false + } +} + +// The following variables define the supported encodings. +// By defining these as package-level variables, we allow easy reference +// throughout the package and by users of the package. This design also +// facilitates potential future extension by simply adding new encoding variables. + +// Unknown represents an unknown or unrecognized encoding. +// Having an Unknown encoding allows us to handle cases where +// the encoding cannot be determined, providing a fallback option. +var Unknown Encoding = bomEncoding{ + encodingType: "Unknown", + sequence: nil, // Unknown encoding has no BOM sequence + hasSequenceFunc: func(data Peeker) bool { return false }, +} + +// UTF8 represents the UTF-8 encoding. +var UTF8 Encoding = bomEncoding{ + encodingType: "UTF-8", + sequence: []byte{0xEF, 0xBB, 0xBF}, // UTF-8 BOM sequence + hasSequenceFunc: isUTF8, +} + +// UTF16BE represents the UTF-16 Big Endian encoding. +var UTF16BE Encoding = bomEncoding{ + encodingType: "UTF-16 BE", + sequence: []byte{0xFE, 0xFF}, // UTF-16 BE BOM sequence + hasSequenceFunc: isUTF16BE, +} + +// UTF16LE represents the UTF-16 Little Endian encoding. 
+var UTF16LE Encoding = bomEncoding{ + encodingType: "UTF-16 LE", + sequence: []byte{0xFF, 0xFE}, // UTF-16 LE BOM sequence + hasSequenceFunc: isUTF16LE, +} + +// UTF32BE represents the UTF-32 Big Endian encoding. +var UTF32BE Encoding = bomEncoding{ + encodingType: "UTF-32 BE", + sequence: []byte{0x00, 0x00, 0xFE, 0xFF}, // UTF-32 BE BOM sequence + hasSequenceFunc: isUTF32BE, +} + +// UTF32LE represents the UTF-32 Little Endian encoding. +var UTF32LE Encoding = bomEncoding{ + encodingType: "UTF-32 LE", + sequence: []byte{0xFF, 0xFE, 0x00, 0x00}, // UTF-32 LE BOM sequence + hasSequenceFunc: isUTF32LE, +} diff --git a/packages/go/bomenc/encodings_test.go b/packages/go/bomenc/encodings_test.go new file mode 100644 index 0000000000..73a617a7f0 --- /dev/null +++ b/packages/go/bomenc/encodings_test.go @@ -0,0 +1,233 @@ +// Copyright 2024 Specter Ops, Inc. +// +// Licensed under the Apache License, Version 2.0 +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package bomenc + +import ( + "bufio" + "bytes" + "io" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestEncodingInterface(t *testing.T) { + encodings := []struct { + name string + encoding Encoding + }{ + {name: "Unknown", encoding: Unknown}, + {name: "UTF8", encoding: UTF8}, + {name: "UTF16BE", encoding: UTF16BE}, + {name: "UTF16LE", encoding: UTF16LE}, + {name: "UTF32BE", encoding: UTF32BE}, + {name: "UTF32LE", encoding: UTF32LE}, + } + + for _, tt := range encodings { + t.Run(tt.name, func(t *testing.T) { + assert.NotEmpty(t, tt.encoding.String(), "Encoding String() should not be empty") + if tt.encoding.String() != Unknown.String() { + assert.NotEmpty(t, tt.encoding.Sequence(), "Encoding Sequence() should not be empty for non-Unknown encodings") + } + // Test HasSequence method + if tt.encoding.String() != Unknown.String() { + reader := bufio.NewReader(bytes.NewReader(tt.encoding.Sequence())) + assert.True(t, tt.encoding.HasSequence(reader), "HasSequence() should return true for its own sequence") + } + }) + } +} + +func TestEncodingValues(t *testing.T) { + tests := []struct { + name string + encoding Encoding + expectedType string + expectedSeq []byte + }{ + { + name: "Unknown", + encoding: Unknown, + expectedType: "Unknown", + expectedSeq: nil, + }, + { + name: "UTF-8", + encoding: UTF8, + expectedType: "UTF-8", + expectedSeq: []byte{0xEF, 0xBB, 0xBF}, + }, + { + name: "UTF-16 BE", + encoding: UTF16BE, + expectedType: "UTF-16 BE", + expectedSeq: []byte{0xFE, 0xFF}, + }, + { + name: "UTF-16 LE", + encoding: UTF16LE, + expectedType: "UTF-16 LE", + expectedSeq: []byte{0xFF, 0xFE}, + }, + { + name: "UTF-32 BE", + encoding: UTF32BE, + expectedType: "UTF-32 BE", + expectedSeq: []byte{0x00, 0x00, 0xFE, 0xFF}, + }, + { + name: "UTF-32 LE", + encoding: UTF32LE, + expectedType: "UTF-32 LE", + expectedSeq: []byte{0xFF, 0xFE, 0x00, 0x00}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t 
*testing.T) { + assert.Equal(t, tt.expectedType, tt.encoding.String(), "Encoding type should match") + assert.Equal(t, tt.expectedSeq, tt.encoding.Sequence(), "Encoding sequence should match") + if tt.encoding.String() != Unknown.String() { + assert.True(t, tt.encoding.HasSequence(bufio.NewReader(bytes.NewReader(tt.expectedSeq))), "HasSequence() should return true for the expected sequence") + } + }) + } +} + +func TestBOMEncoding(t *testing.T) { + testCases := []struct { + name string + encoding bomEncoding + expectedString string + expectedSeq []byte + testData []byte + hasSequence bool + }{ + { + name: "Custom encoding", + encoding: bomEncoding{ + encodingType: "Custom", + sequence: []byte{0x01, 0x02, 0x03}, + hasSequenceFunc: func(input Peeker) bool { + if data, err := input.Peek(3); err == nil { + return len(data) >= 3 && data[0] == 0x01 && data[1] == 0x02 && data[2] == 0x03 + } + return false + }, + }, + expectedString: "Custom", + expectedSeq: []byte{0x01, 0x02, 0x03}, + testData: []byte{0x01, 0x02, 0x03, 0x04}, + hasSequence: true, + }, + { + name: "Empty encoding", + encoding: bomEncoding{ + encodingType: "", + sequence: []byte{}, + hasSequenceFunc: func(input Peeker) bool { + if data, err := input.Peek(0); err == io.EOF { + return len(data) == 0 + } + return false + }, + }, + expectedString: "", + expectedSeq: []byte{}, + testData: []byte{0x01, 0x02, 0x03}, + hasSequence: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expectedString, tc.encoding.String(), "bomEncoding String() should return correct value") + assert.Equal(t, tc.expectedSeq, tc.encoding.Sequence(), "bomEncoding Sequence() should return correct value") + assert.Equal(t, tc.hasSequence, tc.encoding.HasSequence(bufio.NewReader(bytes.NewReader(tc.testData))), "bomEncoding HasSequence() should return correct value") + }) + } +} + +func TestEncodingEquality(t *testing.T) { + testCases := []struct { + name string + enc1 Encoding + enc2 
Encoding + expected bool + }{ + { + name: "Same encoding", + enc1: UTF8, + enc2: UTF8, + expected: true, + }, + { + name: "Different encodings", + enc1: UTF8, + enc2: UTF16BE, + expected: false, + }, + { + name: "Unknown and other encoding", + enc1: Unknown, + enc2: UTF8, + expected: false, + }, + { + name: "Both Unknown", + enc1: Unknown, + enc2: Unknown, + expected: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, tc.enc1.String() == tc.enc2.String(), "Encoding equality check should be correct") + }) + } +} + +func TestHasSequence(t *testing.T) { + testCases := []struct { + name string + encoding Encoding + input []byte + expected bool + }{ + {"UTF-8 with correct BOM", UTF8, []byte{0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F}, true}, + {"UTF-8 without BOM", UTF8, []byte{0x68, 0x65, 0x6C, 0x6C, 0x6F}, false}, + {"UTF-16BE with correct BOM", UTF16BE, []byte{0xFE, 0xFF, 0x00, 0x68, 0x00, 0x65}, true}, + {"UTF-16BE without BOM", UTF16BE, []byte{0x00, 0x68, 0x00, 0x65}, false}, + {"UTF-16LE with correct BOM", UTF16LE, []byte{0xFF, 0xFE, 0x68, 0x00, 0x65, 0x00}, true}, + {"UTF-16LE without BOM", UTF16LE, []byte{0x68, 0x00, 0x65, 0x00}, false}, + {"UTF-32BE with correct BOM", UTF32BE, []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68}, true}, + {"UTF-32BE without BOM", UTF32BE, []byte{0x00, 0x00, 0x00, 0x68}, false}, + {"UTF-32LE with correct BOM", UTF32LE, []byte{0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00}, true}, + {"UTF-32LE without BOM", UTF32LE, []byte{0x68, 0x00, 0x00, 0x00}, false}, + {"Unknown encoding", Unknown, []byte{0x68, 0x65, 0x6C, 0x6C, 0x6F}, false}, + {"Empty input", UTF8, []byte{}, false}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result := tc.encoding.HasSequence(bufio.NewReader(bytes.NewReader(tc.input))) + assert.Equal(t, tc.expected, result, "HasSequence() should correctly identify BOM presence") + }) + } +} diff --git 
a/packages/go/bomenc/go.mod b/packages/go/bomenc/go.mod new file mode 100644 index 0000000000..9230c24b52 --- /dev/null +++ b/packages/go/bomenc/go.mod @@ -0,0 +1,14 @@ +module github.com/specterops/bloodhound/bomenc + +go 1.21.3 + +require ( + github.com/stretchr/testify v1.9.0 + golang.org/x/text v0.17.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/packages/go/bomenc/go.sum b/packages/go/bomenc/go.sum new file mode 100644 index 0000000000..53f41df4ba --- /dev/null +++ b/packages/go/bomenc/go.sum @@ -0,0 +1,12 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= +golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/packages/go/bomenc/normalize.go b/packages/go/bomenc/normalize.go new file mode 100644 index 0000000000..39dc0e99ad --- /dev/null +++ b/packages/go/bomenc/normalize.go @@ -0,0 +1,85 @@ +// Copyright 2024 Specter Ops, Inc. 
+// +// Licensed under the Apache License, Version 2.0 +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package bomenc + +import ( + "bufio" + "errors" + "io" + + "golang.org/x/text/encoding/unicode" + "golang.org/x/text/encoding/unicode/utf32" +) + +// DetectBOMEncoding detects the byte order mark at the start of the data exposed by the given Peeker and returns the corresponding Encoding. +// This function is crucial for determining the encoding of incoming data based on its BOM. +func DetectBOMEncoding(data Peeker) Encoding { + if UTF8.HasSequence(data) { + return UTF8 + } else if UTF16BE.HasSequence(data) { + return UTF16BE + } else if UTF16LE.HasSequence(data) { // NOTE: The byte sequence for UTF16LE matches the first two bytes for UTF32LE; this check's implementation ensures that the BOM does not accidentally match UTF32LE + return UTF16LE + } else if UTF32LE.HasSequence(data) { + return UTF32LE + } else if UTF32BE.HasSequence(data) { + return UTF32BE + } else { + return Unknown + } +} + +// NormalizeToUTF8 converts the input to UTF-8, removing any BOM. +// This function is the main entry point for normalizing data from an io.Reader. +// It's useful when working with streams of data, such as file input. 
+func NormalizeToUTF8(input io.Reader) (io.Reader, error) { + buf := bufio.NewReader(input) + switch DetectBOMEncoding(buf).String() { + case UTF8.String(): + if _, err := buf.Discard(3); err != nil { + return nil, err + } else { + return buf, nil + } + case UTF16LE.String(): + return transformUTF16toUTF8(unicode.LittleEndian, buf) + case UTF16BE.String(): + return transformUTF16toUTF8(unicode.BigEndian, buf) + case UTF32LE.String(): + return transformUTF32toUTF8(utf32.LittleEndian, buf) + case UTF32BE.String(): + return transformUTF32toUTF8(utf32.BigEndian, buf) + default: + // Either Unknown or no BOM + return unicode.UTF8.NewDecoder().Reader(buf), nil + } +} + +// transformUTF16toUTF8 aids NormalizeToUTF8 in converting UTF16 data with a given endianness into UTF8 +func transformUTF16toUTF8(endianness unicode.Endianness, data io.Reader) (io.Reader, error) { + utf16leDecoder := unicode.UTF16(endianness, unicode.UseBOM).NewDecoder() + return utf16leDecoder.Reader(data), nil +} + +// transformUTF32toUTF8 aids NormalizeToUTF8 in converting UTF32 data with a given endianness into UTF8 +func transformUTF32toUTF8(endianness utf32.Endianness, data io.Reader) (io.Reader, error) { + utf16leDecoder := utf32.UTF32(endianness, utf32.UseBOM).NewDecoder() + return utf16leDecoder.Reader(data), nil +} + +// ErrUnknownEncodingInvalidUTF8 reports input whose encoding is unknown and which is not valid UTF-8. +var ErrUnknownEncodingInvalidUTF8 = errors.New("unknown encoding and not a valid UTF-8") diff --git a/packages/go/bomenc/normalize_test.go b/packages/go/bomenc/normalize_test.go new file mode 100644 index 0000000000..6544d92333 --- /dev/null +++ b/packages/go/bomenc/normalize_test.go @@ -0,0 +1,250 @@ +// Copyright 2024 Specter Ops, Inc. +// +// Licensed under the Apache License, Version 2.0 +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package bomenc
+
+import (
+	"bufio"
+	"bytes"
+	"errors"
+	"io"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/text/encoding/unicode"
+)
+
+func TestDetectBOMEncoding(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    []byte
+		expected Encoding
+	}{
+		{
+			name:     "UTF-8 BOM",
+			input:    []byte{0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F},
+			expected: UTF8,
+		},
+		{
+			name:     "UTF-16BE BOM",
+			input:    []byte{0xFE, 0xFF, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F},
+			expected: UTF16BE,
+		},
+		{
+			name:     "UTF-16LE BOM",
+			input:    []byte{0xFF, 0xFE, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00},
+			expected: UTF16LE,
+		},
+		{
+			name:     "UTF-32BE BOM",
+			input:    []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65},
+			expected: UTF32BE,
+		},
+		{
+			name:     "UTF-32LE BOM",
+			input:    []byte{0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00},
+			expected: UTF32LE,
+		},
+		{
+			name:     "No BOM",
+			input:    []byte{0x68, 0x65, 0x6C, 0x6C, 0x6F},
+			expected: Unknown,
+		},
+		{
+			name:     "Empty input",
+			input:    []byte{},
+			expected: Unknown,
+		},
+		{
+			name:     "Incomplete UTF-16LE BOM (should not be detected as UTF-16LE)",
+			input:    []byte{0xFF, 0xFE, 0x68},
+			expected: Unknown,
+		},
+		{
+			name:     "Incomplete UTF-32LE BOM (should not be detected as UTF-32LE)",
+			input:    []byte{0xFF, 0xFE, 0x00},
+			expected: Unknown,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			reader := bufio.NewReader(bytes.NewReader(tt.input))
+			result := DetectBOMEncoding(reader)
+			assert.Equal(t, tt.expected.String(), result.String(), "DetectBOMEncoding() should return the correct encoding")
+		})
+	}
+}
+
+func TestNormalizeToUTF8(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    []byte
+		expected []byte
+		encFrom  Encoding
+		wantErr  bool
+	}{
+		{
+			name:     "UTF-8 BOM",
+			input:    []byte{0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F},
+			expected: []byte("hello"),
+			encFrom:  UTF8,
+			wantErr:  false,
+		},
+		{
+			name:     "UTF-16BE BOM",
+			input:    []byte{0xFE, 0xFF, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F},
+			expected: []byte("hello"),
+			encFrom:  UTF16BE,
+			wantErr:  false,
+		},
+		{
+			name:     "UTF-16LE BOM",
+			input:    []byte{0xFF, 0xFE, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00},
+			expected: []byte("hello"),
+			encFrom:  UTF16LE,
+			wantErr:  false,
+		},
+		{
+			name:     "UTF-32BE BOM",
+			input:    []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F},
+			expected: []byte("hello"),
+			encFrom:  UTF32BE,
+			wantErr:  false,
+		},
+		{
+			name:     "UTF-32LE BOM",
+			input:    []byte{0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00},
+			expected: []byte("hello"),
+			encFrom:  UTF32LE,
+			wantErr:  false,
+		},
+		{
+			name:     "No BOM (valid UTF-8)",
+			input:    []byte("hello"),
+			expected: []byte("hello"),
+			encFrom:  Unknown,
+			wantErr:  false,
+		},
+		{
+			name:     "No BOM (invalid UTF-8)",
+			input:    []byte{0xFF, 0xFE, 0xFD},
+			expected: []byte{0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD},
+			encFrom:  Unknown,
+			wantErr:  false,
+		},
+		{
+			name:     "Empty input",
+			input:    []byte{},
+			expected: []byte{},
+			encFrom:  Unknown,
+			wantErr:  false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			reader := bufio.NewReader(bytes.NewReader(tt.input))
+			detectedEnc := DetectBOMEncoding(reader)
+			result, err := NormalizeToUTF8(reader)
+			require.NoError(t, err) // result is nil on error, so stop before io.ReadAll dereferences it
+			actual, err := io.ReadAll(result)
+
+			if tt.wantErr {
+				assert.Error(t, err, "NormalizeToUTF8() should return an error for invalid input")
+				return
+			}
+
+			require.NoError(t, err, "NormalizeToUTF8() should not return an error for valid input")
+			assert.Equal(t, tt.expected, actual, "NormalizedContent() should return the correct normalized content")
+			assert.Equal(t, tt.encFrom.String(), detectedEnc.String(), "NormalizedFrom() should return the correct original encoding")
+		})
+	}
+}
+
+// errorReader is a mock io.Reader whose Read always fails, for testing error cases
+type errorReader struct{}
+
+func (er errorReader) Read(p []byte) (n int, err error) {
+	return 0, errors.New("mock read error")
+}
+
+func TestNormalizeToUTF8_ReaderError(t *testing.T) {
+	reader, err := NormalizeToUTF8(errorReader{})
+	require.NoError(t, err) // a nil reader would make io.ReadAll panic instead of failing the test cleanly
+	_, err = io.ReadAll(reader)
+	assert.Error(t, err, "NormalizeToUTF8() should return an error when the reader fails")
+}
+
+func TestNormalizeToUTF8_LargeInput(t *testing.T) {
+	type testCase struct {
+		name     string
+		input    []byte
+		expected []byte
+		encFrom  Encoding
+	}
+
+	// Generate a large data set with 1000 Unicode code points (U+0001 through U+03E8)
+	// NOTE: We don't want to begin the payload with a NULL character because it would then be impossible to discern between UTF16LE and UTF32LE
+	var runes []rune
+	for i := 1; i <= 1000; i++ {
+		runes = append(runes, rune(i%0x10FFFF))
+	}
+
+	utf8Bytes := []byte(string(runes))
+	utf16LEBytes, err := unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewEncoder().Bytes(utf8Bytes)
+	require.NoError(t, err)
+
+	tests := []testCase{
+		{
+			name:     "Large UTF-16LE input",
+			input:    utf16LEBytes,
+			expected: utf8Bytes,
+			encFrom:  UTF16LE,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			reader := bufio.NewReader(bytes.NewReader(tt.input))
+			detectedEnc := DetectBOMEncoding(reader)
+			assert.Equal(t, tt.encFrom.String(), detectedEnc.String(), "NormalizedFrom() should return the correct original encoding")
+
+			result, err := NormalizeToUTF8(reader)
+			if err != nil {
+				t.Errorf("NormalizeToUTF8() error = %v", err)
+				// Print the first few bytes of the input for debugging; %.20x formats at most
+				// 20 bytes, so a short slice cannot panic the way slicing with [:20] would
+				t.Logf("First 20 bytes of input: %.20x", tt.input)
+				// Print the detected encoding
+				t.Logf("Detected encoding: %v", detectedEnc)
+				return
+			}
+
+			actual, err := io.ReadAll(result)
+			assert.NoError(t, err)
+
+			if !bytes.Equal(tt.expected, actual) {
+				t.Errorf("NormalizedContent() = %v, want %v", actual, tt.expected)
+				// Print the first few bytes of the result and expected for debugging (truncated safely by %.20x)
+				t.Logf("First 20 bytes of result: %.20x", actual)
+				t.Logf("First 20 bytes of expected: %.20x", tt.expected)
+				t.Logf("First 20 bytes of input: %.20x", tt.input)
+			}
+		})
+	}
+}