-
Notifications
You must be signed in to change notification settings - Fork 76
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Audit the encoding column (with values "big5", "cns") in BPMFBase.txt #497
Comments
Please remind me if the encoding column is for "Ctrl + ` Key: Input Big 5 Code" and if so, how to test it. 🙏 |
I guess a simple script can do that. // Run `swift inspect.swift` from the command line
import CoreFoundation
import Foundation
/// Returns the encoded byte value of each character of `string` as hexadecimal text.
///
/// Every `Character` is converted on its own with `CFStringGetCString`, and the
/// per-character results are joined with a single space.
///
/// - Parameters:
///   - string: The text to inspect; one result entry is produced per `Character`.
///   - encoding: The raw encoding identifier handed to `CFStringGetCString`.
/// - Returns: For each character, `"0x"` followed by the first two encoded bytes in
///   uppercase hex, or `"N/A"` when the conversion fails or yields fewer than two
///   bytes. An empty `string` yields an empty result.
func getCharCode(string: String, encoding: UInt32) -> String {
    // Room for up to three encoded bytes plus the terminating NUL.
    let bufferSize = 4
    return string.map { character -> String in
        let cfString = String(character) as CFString
        var buffer = [CChar](repeating: 0, count: bufferSize)
        // CFStringGetCString reports failure through its return value; do not
        // interpret the buffer contents unless the conversion succeeded.
        guard CFStringGetCString(cfString, &buffer, bufferSize, encoding) else {
            return "N/A"
        }
        // Collect the bytes of the C string, up to (not including) the NUL terminator.
        let bytes = buffer.prefix { $0 != 0 }.map { UInt8(bitPattern: $0) }
        guard bytes.count >= 2 else {
            return "N/A"
        }
        return "0x" + String(format: "%02X%02X", bytes[0], bytes[1])
    }.joined(separator: " ")
}
// Raw encoding identifiers, passed as the `encoding` argument of getCharCode.
// NOTE(review): values are expected to match the same-named CoreFoundation
// external string encodings — confirm against CFStringEncodingExt.h.
let kCFStringEncodingBig5: UInt32 = 0x0A03
let kCFStringEncodingBig5_HKSCS_1999: UInt32 = 0x0A06
let kCFStringEncodingCNS_11643_92_P3: UInt32 = 0x0653
/// Audits the last column of `../BPMFBase.txt` against what each encoding can represent.
///
/// The file is read as UTF-8 and split on `"\n"`; each line is split on single
/// spaces. Lines that do not have exactly five fields are printed unchanged so
/// they can be inspected by hand. For the remaining lines the first field is the
/// entry being checked and the fifth field is its declared category
/// (`"big5"`, `"cns"` or `"utf8"`); a message is printed whenever the declared
/// category disagrees with the conversion results from `getCharCode`.
///
/// - Throws: Any error raised while reading the file.
func main() throws {
    let path = "../BPMFBase.txt"
    let url = URL(fileURLWithPath: path)
    let text = try String(contentsOf: url, encoding: .utf8)

    // Marker returned by getCharCode for a character that cannot be encoded.
    // NOTE(review): the comparisons below match a single-character result only;
    // a multi-character entry produces several space-separated codes — confirm
    // that every entry in the data file is one character.
    let notAvailable = "N/A"

    for line in text.components(separatedBy: "\n") {
        // A trailing newline produces one empty component; it is not a malformed entry.
        if line.isEmpty {
            continue
        }

        let parts = line.components(separatedBy: " ")
        guard parts.count == 5 else {
            // Surface unexpected lines instead of silently skipping them.
            print(line)
            continue
        }

        let word = parts[0]
        let category = parts[4]
        let big5Code = getCharCode(string: word, encoding: kCFStringEncodingBig5)
        let big5HKSCScode = getCharCode(string: word, encoding: kCFStringEncodingBig5_HKSCS_1999)
        let cnsCode = getCharCode(string: word, encoding: kCFStringEncodingCNS_11643_92_P3)

        switch category {
        case "big5":
            if big5Code == notAvailable && big5HKSCScode == notAvailable {
                print("\(word) is not in big5 and big5 HKSCS")
            }
        case "cns":
            // Report the entry when it can NOT be encoded as CNS (the original
            // condition was inverted and flagged every encodable entry).
            if cnsCode == notAvailable {
                print("\(word) is not CNS")
            }
            if big5Code != notAvailable {
                print("\(word) can be big5")
            } else if big5HKSCScode != notAvailable {
                print("\(word) can be big5 HKSCS")
            }
        case "utf8":
            // Suggest the first narrower category that can represent the entry.
            if big5Code != notAvailable {
                print("\(word) can be big5")
            } else if big5HKSCScode != notAvailable {
                print("\(word) can be big5 HKSCS")
            } else if cnsCode != notAvailable {
                print("\(word) can be CNS")
            }
        default:
            // Other categories are not audited.
            break
        }
    }
}
// Run the audit. A failure (for example, the data file cannot be read) is
// reported on standard error with a non-zero exit status rather than being
// discarded, so an empty output always means "nothing to report".
do {
    try main()
} catch {
    FileHandle.standardError.write(Data("error: \(error)\n".utf8))
    exit(1)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See #491 (comment) for details. There are entries that are off. We may want to audit its use and decide whether to rectify those entries, or simply remove the column from the data and the scripts altogether.
The text was updated successfully, but these errors were encountered: