Tuesday, April 25, 2017

Java - iterate over a UTF-8 string

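Either decode the file as UTF-8 while reading it, or re-decode each line as UTF-8 after it has been read. Note that length() and charAt() operate on UTF-16 code units, not on UTF-8 bytes.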


/**
 * Reads the hard-coded text file as UTF-8 and, for every line, prints the
 * line's length followed by each char of the line as a zero-padded hex value
 * (at least four digits), one value per output line.
 *
 * <p>Both {@code length()} and {@code charAt()} work on UTF-16 code units, so
 * a character outside the Basic Multilingual Plane shows up as two surrogate
 * values rather than as one code point.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the file cannot be opened or read
 */
public static void main(String[] args) throws Exception {
    String path = "D:\\test.txt";
    // try-with-resources closes the reader and the underlying stream even
    // when reading fails part-way through; the original never closed them.
    try (FileInputStream stream = new FileInputStream(new File(path));
         BufferedReader br = new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {
        String str;
        while ((str = br.readLine()) != null) {
            System.out.println(str.length());
            for (int i = 0; i < str.length(); ++i) {
                System.out.println(String.format("%04x", (int) str.charAt(i)));
            }
        }
    }
}

Or, if the reader was opened without specifying UTF-8, re-decode each line afterwards:

// Decode the stream as ISO-8859-1 explicitly. That charset maps every byte to
// exactly one char, so getBytes(ISO_8859_1) below recovers the file's original
// bytes unchanged. Relying on the platform default charset here (as the
// no-charset InputStreamReader constructor does) would corrupt the data
// whenever the default is anything other than ISO-8859-1.
BufferedReader br = new BufferedReader(new InputStreamReader(stream, ISO_8859_1));
String str;
while ((str = br.readLine()) != null) {
    // Recover the raw bytes of the line, then decode them as UTF-8.
    byte[] ptext = str.getBytes(ISO_8859_1);
    str = new String(ptext, UTF_8);
    // length()/charAt() count UTF-16 code units of the decoded string.
    System.out.println(str.length());
    for (int i = 0; i < str.length(); ++i) {
        System.out.println(String.format("%04x", (int) str.charAt(i)));
    }
}

No comments:

Blog Archive