Fixing broken text encodings in Xojo
That is a German text saved as UTF-8, then imported in another encoding. We need to clean up that encoding, and I have a nice function for this. We simply check whether the text, converted to another encoding, is valid UTF-8 text, and if so reinterpret it as UTF-8. Here is the function and its test method. The sample text is "Vielen Dank für ihr Verständnis.", which shows up as "Vielen Dank fÃ¼r ihr VerstÃ¤ndnis." when its UTF-8 bytes are misread as Windows ANSI.
Function FixEncoding(s as string) As string
  // Tries to repair text whose UTF-8 bytes were once misread as a legacy
  // (non-UTF) encoding and then converted to UTF-8 again.
  //
  // For every non-UTF encoding known to the runtime, the text is converted
  // to that encoding. If the resulting bytes form valid UTF-8 and the
  // conversion did not introduce any new "?" replacement characters, those
  // bytes are returned labelled as UTF-8.
  //
  // s: the text to repair; must be valid UTF-8.
  // Returns: the repaired text, or s unchanged when s is not valid UTF-8
  //          or when no encoding produces a plausible result.

  dim UTF8 as TextEncoding = encodings.UTF8

  if not UTF8.IsValidData(s) then
    break // this function works only on UTF-8 text!
    return s
  end if

  // Number of "?"-separated fields in the input. A conversion that hits
  // unmappable characters adds "?" and therefore changes this count.
  dim CountFieldsS as Integer = CountFieldsB(s, "?")

  dim u as integer = encodings.Count-1
  for i as integer = 0 to u
    dim e as TextEncoding = encodings.Item(i)

    // defensive: skip a missing entry instead of raising NilObjectException
    if e is nil then continue

    // skip all UTF variants, they can represent everything and prove nothing
    if e.internetName.left(3) = "UTF" then continue

    // convert back to the candidate legacy encoding
    dim t as string = ConvertEncoding(s, e)

    // the bytes must look like UTF-8 ...
    if not UTF8.IsValidData(t) then continue

    // ... and the conversion must not have replaced any characters
    if CountFieldsB(t, "?") <> CountFieldsS then continue

    // we got a fixed string
    return DefineEncoding(t, UTF8)
  next

  // No candidate matched. Without this explicit return the function would
  // end without a Return and hand back an empty string, losing the text.
  return s
End Function
Sub Test()
  // Manual check for FixEncoding: builds three broken variants of a German
  // sentence, runs each through FixEncoding and stops in the debugger so
  // the results can be inspected.

  dim germanText as string = "Vielen Dank für ihr Verständnis."

  // --- Windows ANSI ---
  // Pretend the UTF-8 bytes are Windows ANSI text, then convert that to
  // UTF-8. The result is broken text such as
  // "Vielen Dank fÃ¼r ihr VerstÃ¤ndnis."
  dim labelledAnsi as string = DefineEncoding(germanText, encodings.WindowsANSI)
  dim brokenAnsi as string = ConvertEncoding(labelledAnsi, Encodings.UTF8)
  dim repairedAnsi as string = FixEncoding(brokenAnsi)

  // --- MacRoman ---
  // Same procedure, with the bytes mislabelled as MacRoman.
  dim labelledMac as string = DefineEncoding(germanText, encodings.MacRoman)
  dim brokenMac as string = ConvertEncoding(labelledMac, Encodings.UTF8)
  dim repairedMac as string = FixEncoding(brokenMac)

  // --- DOS Latin 1 ---
  // Same procedure, with the bytes mislabelled as DOSLatin1.
  dim labelledDos as string = DefineEncoding(germanText, encodings.DOSLatin1)
  dim brokenDos as string = ConvertEncoding(labelledDos, Encodings.UTF8)
  dim repairedDos as string = FixEncoding(brokenDos)

  // Inspect repairedAnsi, repairedMac and repairedDos in the debugger;
  // each is expected to match germanText again.
  Break
End Sub