Skip to content

fix: character issues with umlauts #471

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions src/TibiaDataUtils.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,6 @@ func TibiaDataQueryEscapeString(data string) string {
// switching "+" to " "
data = strings.ReplaceAll(data, "+", " ")

// encoding string to latin-1
data, _ = TibiaDataConvertEncodingtoISO88591(data)

// returning with QueryEscape function
return url.QueryEscape(data)
}
Expand Down
5 changes: 4 additions & 1 deletion src/TibiaDataUtils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,19 @@ func TestEscaper(t *testing.T) {
strOne = "god durin"
strTwo = "god+durin"
strThree = "gód"
strFour = "Näurin"
)

sanitizedStrOne := TibiaDataQueryEscapeString(strOne)
sanitizedStrTwo := TibiaDataQueryEscapeString(strTwo)
sanitizedStrThree := TibiaDataQueryEscapeString(strThree)
sanitizedStrFour := TibiaDataQueryEscapeString(strFour)

assert := assert.New(t)
assert.Equal(sanitizedStrOne, "god+durin")
assert.Equal(sanitizedStrTwo, "god+durin")
assert.Equal(sanitizedStrThree, "g%F3d")
assert.Equal(sanitizedStrThree, "g%C3%B3d")
assert.Equal(sanitizedStrFour, "N%C3%A4urin")
}

func TestDateParser(t *testing.T) {
Expand Down
7 changes: 6 additions & 1 deletion src/TibiaWorldsWorld.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"fmt"
"golang.org/x/net/html/charset"
"net/http"
"regexp"
"strings"
Expand Down Expand Up @@ -53,8 +54,12 @@ var (
func TibiaWorldsWorldImpl(world string, BoxContentHTML string, url string) (WorldResponse, error) {
// TODO: We need to read the world name from the response rather than pass it into this func

utf8Reader, err := charset.NewReader(strings.NewReader(BoxContentHTML), "")
if err != nil {
return WorldResponse{}, fmt.Errorf("[error] TibiaWorldsWorldImpl failed at charset.NewReader, err: %s", err)
}
// Loading HTML data into ReaderHTML for goquery with NewReader
ReaderHTML, err := goquery.NewDocumentFromReader(strings.NewReader(BoxContentHTML))
ReaderHTML, err := goquery.NewDocumentFromReader(utf8Reader)
if err != nil {
return WorldResponse{}, fmt.Errorf("[error] TibiaWorldsWorldImpl failed at goquery.NewDocumentFromReader, err: %s", err)
}
Expand Down
34 changes: 34 additions & 0 deletions src/TibiaWorldsWorld_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,47 @@
package main

import (
"golang.org/x/text/encoding/charmap"
"io"
"testing"

"github.com/TibiaData/tibiadata-api-go/src/static"
"github.com/stretchr/testify/assert"
)

// TestWorldAntica_UmlautRecovery verifies that TibiaWorldsWorldImpl decodes
// ISO-8859-1 encoded HTML so that umlauts in player names come out as proper
// UTF-8 rather than mojibake.
func TestWorldAntica_UmlautRecovery(t *testing.T) {
	// Load the UTF-8 HTML fixture.
	f, err := static.TestFiles.Open("testdata/worlds/world/Antica.html")
	if err != nil {
		t.Fatalf("opening test file: %v", err)
	}
	defer f.Close()

	utf8Data, err := io.ReadAll(f)
	if err != nil {
		t.Fatalf("reading test file: %v", err)
	}

	// Re-encode the fixture to ISO-8859-1 so the parser is fed single-byte
	// encoded input instead of UTF-8, simulating the encoding this test is
	// meant to cover.
	isoBytes, err := charmap.ISO8859_1.NewEncoder().Bytes(utf8Data)
	if err != nil {
		t.Fatalf("re‑encoding to ISO‑8859‑1 failed: %v", err)
	}

	// Run the parser under test on the ISO-8859-1 bytes.
	resp, err := TibiaWorldsWorldImpl("Antica", string(isoBytes), "")
	if err != nil {
		t.Fatalf("parser error: %v", err)
	}

	// Guard the index below so an empty player list fails the test cleanly
	// instead of panicking.
	if len(resp.World.OnlinePlayers) == 0 {
		t.Fatalf("expected at least one online player, got none")
	}
	got := resp.World.OnlinePlayers[0].Name

	// It must *not* be the mojibake sequence. The literal is written with
	// escapes (U+00C3 U+00A4 is what UTF-8 "ä" looks like when misread as
	// ISO-8859-1) so that encoding-normalising tools cannot silently turn it
	// into the correct spelling and make this assertion contradict the next.
	assert.NotEqual(t, "N\u00c3\u00a4urin", got, "should not return mojibake")

	// It must be the correct Umlaut.
	assert.Equal(t, "Näurin", got, "should correctly decode Umlaut from input")
}

func TestWorldEndebra(t *testing.T) {
file, err := static.TestFiles.Open("testdata/worlds/world/Endebra.html")
if err != nil {
Expand Down
788 changes: 788 additions & 0 deletions src/static/testdata/worlds/world/Antica.html

Large diffs are not rendered by default.

19 changes: 11 additions & 8 deletions src/webserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/json"
"errors"
"fmt"
"golang.org/x/net/html/charset"
"log"
"net/http"
"os"
Expand Down Expand Up @@ -1289,14 +1290,16 @@ func TibiaDataHTMLDataCollector(TibiaDataRequest TibiaDataRequestStruct) (string
return string(res.Body()), nil
}

// Convert body to io.Reader
resIo := bytes.NewReader(res.Body())

// wrap reader in a converting reader from ISO 8859-1 to UTF-8
resIo2 := TibiaDataConvertEncodingtoUTF8(resIo)

// Load the HTML document
doc, err := goquery.NewDocumentFromReader(resIo2)
// Decode the raw response into real UTF-8 using the Content‑Type header
utf8Reader, err := charset.NewReader(
bytes.NewReader(res.Body()),
res.Header().Get("Content-Type"),
)
if err != nil {
log.Printf("[error] TibiaDataHTMLDataCollector charset.NewReader failed: %s", err)
return "", err
}
doc, err := goquery.NewDocumentFromReader(utf8Reader)
if err != nil {
log.Printf("[error] TibiaDataHTMLDataCollector (URL: %s) error: %s", res.Request.URL, err)
return "", err
Expand Down
Loading