* Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> tags/v1.13.0-rc1
@@ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`. | |||||
an absolute path. | an absolute path. | ||||
- `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`, | - `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`, | ||||
but some users report that only `sh` is available. | but some users report that only `sh` is available. | ||||
- `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset. | |||||
- `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets. If several detected charsets have equal confidence, charsets earlier in the list are chosen in preference to those later. Adding `defaults` places the charsets not named explicitly at that point in the list. | |||||
- `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset used in place of any detected charset other than UTF-8. | |||||
- `FORCE_PRIVATE`: **false**: Force every new repository to be private. | - `FORCE_PRIVATE`: **false**: Force every new repository to be private. | ||||
- `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository. | - `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository. | ||||
\[last, private, public\] | \[last, private, public\] | ||||
@@ -7,6 +7,7 @@ package charset | |||||
import ( | import ( | ||||
"bytes" | "bytes" | ||||
"fmt" | "fmt" | ||||
"strings" | |||||
"unicode/utf8" | "unicode/utf8" | ||||
"code.gitea.io/gitea/modules/log" | "code.gitea.io/gitea/modules/log" | ||||
@@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) { | |||||
} else { | } else { | ||||
detectContent = content | detectContent = content | ||||
} | } | ||||
result, err := textDetector.DetectBest(detectContent) | |||||
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break | |||||
results, err := textDetector.DetectAll(detectContent) | |||||
if err != nil { | if err != nil { | ||||
if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { | |||||
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | |||||
return setting.Repository.AnsiCharset, nil | |||||
} | |||||
return "", err | return "", err | ||||
} | } | ||||
topConfidence := results[0].Confidence | |||||
topResult := results[0] | |||||
priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))] | |||||
for _, result := range results { | |||||
// As results are sorted in confidence order - if we have a different confidence | |||||
// we know it's less than the current confidence and can break out of the loop early | |||||
if result.Confidence != topConfidence { | |||||
break | |||||
} | |||||
// Otherwise check if this result is earlier in DetectedCharsetsOrder than our current top guess | |||||
resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))] | |||||
if resultHas && (!has || resultPriority < priority) { | |||||
topResult = result | |||||
priority = resultPriority | |||||
has = true | |||||
} | |||||
} | |||||
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument | // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument | ||||
if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { | |||||
if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { | |||||
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | ||||
return setting.Repository.AnsiCharset, err | return setting.Repository.AnsiCharset, err | ||||
} | } | ||||
log.Debug("Detected encoding: %s", result.Charset) | |||||
return result.Charset, err | |||||
log.Debug("Detected encoding: %s", topResult.Charset) | |||||
return topResult.Charset, err | |||||
} | } |
@@ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) { | |||||
// we accept either. | // we accept either. | ||||
assert.Contains(t, encoding, "ISO-8859") | assert.Contains(t, encoding, "ISO-8859") | ||||
old := setting.Repository.AnsiCharset | |||||
setting.Repository.AnsiCharset = "placeholder" | setting.Repository.AnsiCharset = "placeholder" | ||||
defer func() { | |||||
setting.Repository.AnsiCharset = old | |||||
}() | |||||
testSuccess(b, "placeholder") | testSuccess(b, "placeholder") | ||||
// invalid bytes | // invalid bytes | ||||
@@ -24,6 +24,8 @@ const ( | |||||
// Repository settings | // Repository settings | ||||
var ( | var ( | ||||
Repository = struct { | Repository = struct { | ||||
DetectedCharsetsOrder []string | |||||
DetectedCharsetScore map[string]int `ini:"-"` | |||||
AnsiCharset string | AnsiCharset string | ||||
ForcePrivate bool | ForcePrivate bool | ||||
DefaultPrivate string | DefaultPrivate string | ||||
@@ -88,6 +90,42 @@ var ( | |||||
Wiki []string | Wiki []string | ||||
} `ini:"repository.signing"` | } `ini:"repository.signing"` | ||||
}{ | }{ | ||||
DetectedCharsetsOrder: []string{ | |||||
"UTF-8", | |||||
"UTF-16BE", | |||||
"UTF-16LE", | |||||
"UTF-32BE", | |||||
"UTF-32LE", | |||||
"ISO-8859-1", | |||||
"windows-1252", | |||||
"ISO-8859-2", | |||||
"windows-1250", | |||||
"ISO-8859-5", | |||||
"ISO-8859-6", | |||||
"ISO-8859-7", | |||||
"windows-1253", | |||||
"ISO-8859-8-I", | |||||
"windows-1255", | |||||
"ISO-8859-8", | |||||
"windows-1251", | |||||
"windows-1256", | |||||
"KOI8-R", | |||||
"ISO-8859-9", | |||||
"windows-1254", | |||||
"Shift_JIS", | |||||
"GB18030", | |||||
"EUC-JP", | |||||
"EUC-KR", | |||||
"Big5", | |||||
"ISO-2022-JP", | |||||
"ISO-2022-KR", | |||||
"ISO-2022-CN", | |||||
"IBM424_rtl", | |||||
"IBM424_ltr", | |||||
"IBM420_rtl", | |||||
"IBM420_ltr", | |||||
}, | |||||
DetectedCharsetScore: map[string]int{}, | |||||
AnsiCharset: "", | AnsiCharset: "", | ||||
ForcePrivate: false, | ForcePrivate: false, | ||||
DefaultPrivate: RepoCreatingLastUserVisibility, | DefaultPrivate: RepoCreatingLastUserVisibility, | ||||
@@ -208,6 +246,10 @@ func newRepository() { | |||||
} else { | } else { | ||||
RepoRootPath = filepath.Clean(RepoRootPath) | RepoRootPath = filepath.Clean(RepoRootPath) | ||||
} | } | ||||
defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder)) | |||||
for _, charset := range Repository.DetectedCharsetsOrder { | |||||
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset))) | |||||
} | |||||
ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash") | ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash") | ||||
if err = Cfg.Section("repository").MapTo(&Repository); err != nil { | if err = Cfg.Section("repository").MapTo(&Repository); err != nil { | ||||
@@ -222,6 +264,38 @@ func newRepository() { | |||||
log.Fatal("Failed to map Repository.PullRequest settings: %v", err) | log.Fatal("Failed to map Repository.PullRequest settings: %v", err) | ||||
} | } | ||||
preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder)) | |||||
for _, charset := range Repository.DetectedCharsetsOrder { | |||||
canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) | |||||
preferred = append(preferred, canonicalCharset) | |||||
// remove it from the defaults | |||||
for i, charset := range defaultDetectedCharsetsOrder { | |||||
if charset == canonicalCharset { | |||||
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...) | |||||
break | |||||
} | |||||
} | |||||
} | |||||
i := 0 | |||||
for _, charset := range preferred { | |||||
// Add the defaults | |||||
if charset == "defaults" { | |||||
for _, charset := range defaultDetectedCharsetsOrder { | |||||
canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) | |||||
if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has { | |||||
Repository.DetectedCharsetScore[canonicalCharset] = i | |||||
i++ | |||||
} | |||||
} | |||||
continue | |||||
} | |||||
if _, has := Repository.DetectedCharsetScore[charset]; !has { | |||||
Repository.DetectedCharsetScore[charset] = i | |||||
i++ | |||||
} | |||||
} | |||||
if !filepath.IsAbs(Repository.Upload.TempPath) { | if !filepath.IsAbs(Repository.Upload.TempPath) { | ||||
Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath) | Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath) | ||||
} | } | ||||