Converting Strings to Hex

Denton · Aug 10, 2012#12012-08-10T03:17+00:00

I've recently been shown the power of scripts in the macro forum, and it's made me excited about streamlining a translation workflow. However, scripting seems way beyond my comprehension!

For a translation script, would UE be able to do the following: measure a string's length and insert it at the start, convert the string to hex and then, compared with the previous (original) string, pad it with nulls up to the original length if it is shorter, or ignore it if it is longer than the previous string? E.g.:

Input:

Find "レイヤー"
Replace "Layer"
Find "フォルダー"
Replace "Folder"
Find "ペン"
Replace "Airbrush"
Find "ファイル"
Replace "File"

Output:

Find "0C E3 83 AC E3 82 A4 E3 83 A4 E3 83 BC"
Replace "05 4C 61 79 65 72 00 00 00 00 00 00 00"
Find "0F E3 83 95 E3 82 A9 E3 83 AB E3 83 80 E3 83 BC"
Replace "06 46 6F 6C 64 65 72 00 00 00 00 00 00 00 00 00"
Find "06 E3 83 9A E3 83 B3"
Replace "Airbrush"
Find "0C E3 83 95 E3 82 A1 E3 82 A4 E3 83 AB"
Replace "04 46 69 6C 65 00 00 00 00 00 00 00 00"

Mofi · Aug 10, 2012#22012-08-10T06:16+00:00

That is surely possible, but not as easy as it sounds, because you want the hexadecimal values of the characters encoded in UTF-8. The JavaScript String object only has the member function charCodeAt(), which returns the Unicode (UTF-16) value of a character. So we need to find on the web a JavaScript function which converts the UTF-16 value of a Unicode character to the UTF-8 byte sequence. UTF-8 is an encoding used mainly for storing data in files, not in the memory of applications. UTF-8 encoded characters are converted to UTF-16 by applications supporting UTF-8 when the data is loaded from the file. I will look into this task at the weekend, unless somebody else is faster than me and offers a solution before then. Perhaps in the meantime you can search the web for a JavaScript function for converting UTF-16 (Unicode) values to UTF-8 byte sequences and post the link to the source of this function.

Denton · Aug 10, 2012#32012-08-10T07:37+00:00

The more I read about Unicode the more I'm overwhelmed!
I'm not sure if these will be of use to you:
http://www.onicos.com/staff/iz/amuse/ja ... rt/utf.txt
http://homepage3.nifty.com/aokura/jscript/utf8.html
http://jsfromhell.com/geral/utf-8 (this is ANSI to UTF-8)

Mofi · Aug 11, 2012#42012-08-11T12:43+00:00

You found wonderful functions. With the function to convert a UTF-16 string to a UTF-8 string, it was no problem to write the script code for your requirement. I have written the script to use the CSV file with the strings, as shown below, as the input file, because according to the already deleted topic "Find and Replace (from external file?)" this is the real input file.

Code: Select all

"ファイル";"File"
"レイヤー";"Layer"
"ペン";"Airbrush"
"フォルダー";"Folder"

The script below does all you requested here and in the other topic. So it is already the final script.

Code: Select all

function utf16to8(str) {
   /* Copyright (C) 1999 Masanao Izumo <[email protected]>
    * Version: 1.0
    * LastModified: Dec 25 1999
    * This library is free.  You can redistribute it and/or modify it.
    * http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt
    *
    * Modified: a valid UTF-16 surrogate pair is now combined into one code
    * point and written as a single 4-byte UTF-8 sequence. The original
    * version encoded each half as its own 3-byte sequence, which is not
    * valid UTF-8 for characters above U+FFFF.
    *
    * Encodes the UTF-16 string str as UTF-8.
    *
    * Parameter str: the string to encode.
    * Returns: a string in which every character holds one UTF-8 byte
    *          (character code 0x00 to 0xFF), so the length of the returned
    *          string is the number of UTF-8 bytes.
    *
    * Unchanged from the original: U+0000 is written as the two bytes C0 80
    * and an unpaired surrogate is written as a 3-byte sequence. */
   var out = "";
   var len = str.length;
   var i, c, low;

   for (i = 0; i < len; i++) {
      c = str.charCodeAt(i);

      // High surrogate followed by a low surrogate: merge both code units
      // into the code point they represent and skip the low surrogate.
      if ((c >= 0xD800) && (c <= 0xDBFF) && (i + 1 < len)) {
         low = str.charCodeAt(i + 1);
         if ((low >= 0xDC00) && (low <= 0xDFFF)) {
            c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
            i++;
         }
      }

      if ((c >= 0x0001) && (c <= 0x007F)) {
         // 1 byte: 0xxxxxxx
         out += String.fromCharCode(c);
      } else if (c > 0xFFFF) {
         // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         out += String.fromCharCode(0xF0 | ((c >> 18) & 0x07));
         out += String.fromCharCode(0x80 | ((c >> 12) & 0x3F));
         out += String.fromCharCode(0x80 | ((c >>  6) & 0x3F));
         out += String.fromCharCode(0x80 | ((c >>  0) & 0x3F));
      } else if (c > 0x07FF) {
         // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
         out += String.fromCharCode(0xE0 | ((c >> 12) & 0x0F));
         out += String.fromCharCode(0x80 | ((c >>  6) & 0x3F));
         out += String.fromCharCode(0x80 | ((c >>  0) & 0x3F));
      } else {
         // 2 bytes: 110xxxxx 10xxxxxx (also reached for U+0000)
         out += String.fromCharCode(0xC0 | ((c >>  6) & 0x1F));
         out += String.fromCharCode(0x80 | ((c >>  0) & 0x3F));
      }
   }
   return out;
}

// Converts a UTF-8 encoded string (one byte per character, as returned by
// utf16to8) to an upper case ASCII string of hexadecimal digits. The result
// starts with the number of bytes of the string in hexadecimal notation,
// followed by two hexadecimal digits for every byte. No spaces are inserted
// between the values.
function utf8ToLengthPrefixedHex(sUtf8)
{
   var sLength = sUtf8.length.toString(16);
   // If the number of digits of the length is odd, a leading zero is needed.
   var sHex = (sLength.length % 2) ? "0" + sLength : sLength;
   for (var nByteIndex = 0; nByteIndex < sUtf8.length; nByteIndex++)
   {
      var nByteValue = sUtf8.charCodeAt(nByteIndex);
      if (nByteValue < 16) sHex += "0";
      sHex += nByteValue.toString(16);
   }
   return sHex.toUpperCase();
}

if (UltraEdit.document.length > 1)  // At least 2 files opened in UltraEdit?
{
   // Define environment for script.
   UltraEdit.ueReOn();              // Use UltraEdit regular expression search engine.
   UltraEdit.insertMode();          // Turn on insert mode.
   if (typeof(UltraEdit.columnModeOff) == "function") UltraEdit.columnModeOff();    // Turn off column mode.
   else if (typeof(UltraEdit.activeDocument.columnModeOff) == "function") UltraEdit.activeDocument.columnModeOff();

   // Determine which file to edit and which file contains the strings by
   // finding out which file is opened in hex edit mode: first or second file.
   var FileToEdit = UltraEdit.document[0];
   var FileWithStrings = UltraEdit.document[1];
   if (UltraEdit.document[0].hexMode == false)
   {
      FileToEdit = UltraEdit.document[1];
      FileWithStrings = UltraEdit.document[0];
   }

   FileWithStrings.selectAll();  // Select entire content of the file with the strings.
   UltraEdit.selectClipboard(9); // Copy selection to user clipboard 9 as selection property
   FileWithStrings.copy();       // of UltraEdit document does not support Unicode strings.

   // Get just the strings in double quotes and the DOS line terminators into a large Unicode
   // string whereby every semicolon in the CSV file is replaced by a DOS line termination too.
   var sUnicodeStrings = UltraEdit.clipboardContent.replace(/\"(.+?)\";\"(.+?)\"/g,"$1\r\n$2");

   // Clear user clipboard 9 and cancel selection by moving caret to top of input file.
   UltraEdit.clearClipboard();
   UltraEdit.selectClipboard(0);
   FileWithStrings.top();
   // Split this large Unicode string into an array of Unicode strings on every DOS line termination.
   var asSearchReplace = sUnicodeStrings.split("\r\n");
   // Remove the last string if it is an empty string because file ended with a line termination.
   if (asSearchReplace[asSearchReplace.length-1] === "") asSearchReplace.pop();

   // The array of Unicode strings contains alternating search and replace strings which are
   // encoded in UTF-8 and converted to ASCII strings with the hexadecimal numbers of the
   // UTF-8 bytes. These arrays hold in memory the resulting search and replace strings.
   var asHexSearchStrings = [];
   var asHexReplaceStrings = [];

   // Process the strings in pairs: even index = search string, odd index = replace string.
   // A trailing string without a partner (odd number of strings) is skipped instead of
   // being passed on as undefined value, which would abort the script with an error.
   for (var nStringIndex = 0; nStringIndex + 1 < asSearchReplace.length; nStringIndex += 2)
   {
      // Convert search and replace string from UTF-16 to UTF-8 after converting
      // every \n (backslash + n) in the string to carriage return + line-feed (0D 0A).
      var sUtf8Search = utf16to8(asSearchReplace[nStringIndex].replace(/\\n/g,"\r\n"));
      var sUtf8Replace = utf16to8(asSearchReplace[nStringIndex+1].replace(/\\n/g,"\r\n"));

      // If UTF-8 encoded replace string is longer than UTF-8 encoded search string, ignore those strings.
      if (sUtf8Replace.length > sUtf8Search.length) continue;

      var sNextSearch = utf8ToLengthPrefixedHex(sUtf8Search);
      var sNextReplace = utf8ToLengthPrefixedHex(sUtf8Replace);

      // Append '0' to replace string if shorter than search string. Two appended
      // zeros are one null byte, so both strings describe the same number of bytes.
      while (sNextReplace.length < sNextSearch.length)
      {
         sNextReplace += "0";
      }
      asHexSearchStrings.push(sNextSearch);
      asHexReplaceStrings.push(sNextReplace);
   }

   // Make the replaces in hex edit mode on file to edit using the strings from the other file.
   // NOTE(review): the caret is moved to top of file only once and every replace searches
   // downwards for a single occurrence, so presumably a string located above the position
   // of the previous replace is not found - confirm against UltraEdit's behavior.
   FileToEdit.top();
   FileToEdit.findReplace.mode=0;
   FileToEdit.findReplace.searchAscii=false;
   FileToEdit.findReplace.matchCase=false;
   FileToEdit.findReplace.regExp=false;
   FileToEdit.findReplace.searchDown=true;
   FileToEdit.findReplace.replaceAll=false;
   for (var nReplaceIndex = 0; nReplaceIndex < asHexSearchStrings.length; nReplaceIndex++)
   {
      FileToEdit.findReplace.replace(asHexSearchStrings[nReplaceIndex],asHexReplaceStrings[nReplaceIndex]);
   }
}

Denton · Aug 11, 2012#52012-08-11T16:48+00:00

It is working almost perfectly, except that it seems to fail on line breaks.

The Find/Replace format would be:

Code: Select all

Find "いくつかの例
とテキスト
改行"
Replace "Some example
text with a
line break"

The old CSV format (which was harder to edit and check for formatting errors) is:

Code: Select all

"いくつかの例\nとテキスト\n改行";"Some example\ntext with a\nline break"

It looks as if the script doesn't convert linebreak \n into 0D 0A so fails to find/replace it in the hex?

I thought I should explain how we were using the hex conversion.
We were using a small program to convert UTF-8 to hex.
Here is the link: http://www.mediafire.com/?302mst24074bxdc
It uses the Find/Replace setup.
Paste this into the upper section and hit 'To' to see how it works:

Code: Select all

Find "これが収まる"
Replace "This will fit"

Find "これは適合しません"
Replace "This will not fit because the string is too long"

It helped to see if a string would fit (we could highlight "unicode"/non-hex text in a huge file) or look for alternate words (currently there's English, Chinese and Korean community translations in development), however the application has a crippling bug which omits nulls from some strings if they are larger than 92 bytes.
So that's why we thought it would be great if it were possible to output UTF-8 to hex via script.

Two scripts would be fine, it doesn't need to be combined (if that's easier?). And again, we're all extremely thankful for your time.
Sorry, I hope that made it clearer!

Mofi · Aug 12, 2012#62012-08-12T10:17+00:00

I changed in the script I posted before

Code: Select all

      // Convert next search string from UTF-16 LE to UTF-8. The post-increment
      // moves the index on to the replace string belonging to this search string.
      var sUtf8Search = utf16to8(asSearchReplace[nStringIndex++]);
      // Convert next replace string from UTF-16 LE to UTF-8.
      var sUtf8Replace = utf16to8(asSearchReplace[nStringIndex]);

to

Code: Select all

      // Convert next search string from UTF-16 LE to UTF-8 after converting every \n
      // (backslash + n) in the string to carriage return + line-feed (0D 0A). The
      // post-increment moves the index on to the replace string of this search string.
      var sUtf8Search = utf16to8(asSearchReplace[nStringIndex++].replace(/\\n/g,"\r\n"));
      // Convert next replace string from UTF-16 LE to UTF-8 after converting every \n
      // (backslash + n) in the string to carriage return + line-feed (0D 0A).
      var sUtf8Replace = utf16to8(asSearchReplace[nStringIndex].replace(/\\n/g,"\r\n"));

to fulfill the new requirement too.

It would be no problem to split the script up into 2 scripts if you want to see the strings with the hexadecimal values stored in a file before running the second part, which makes the replaces. It would also be no problem to report which input strings are ignored because the translated string is longer than the string to translate.

Denton · Aug 12, 2012#72012-08-12T18:18+00:00

Awesome, the new script works really well with the line breaks! It's working correctly with test files, but failing with the real files (this might be a formatting error on my part).
Also, the Chinese translator is commenting that the script isn't correctly processing some larger strings...
So having the script split up into 2 scripts would be an excellent way to see what's getting converted correctly and what's getting skipped.

Mofi · Aug 13, 2012#82012-08-13T08:31+00:00

Okay, here are the two scripts.

The first script runs on the active file if that file does not have the file extension JS. If the active file has the file extension JS because it is the script file itself, the script runs on the first file (the leftmost file on the file tabs bar), unless the script file is the first file, in which case the script runs on the second file. The script produces two new files. The first new file contains the strings which are ignored, or just the message that no string was ignored. The second new file contains the UTF-8 encoded strings with their hexadecimal values as needed for the second script.

Code: Select all

function utf16to8(str) {
   /* Copyright (C) 1999 Masanao Izumo <[email protected]>
    * Version: 1.0
    * LastModified: Dec 25 1999
    * This library is free.  You can redistribute it and/or modify it.
    * http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt
    *
    * Modified: a valid UTF-16 surrogate pair is now combined into one code
    * point and written as a single 4-byte UTF-8 sequence. The original
    * version encoded each half as its own 3-byte sequence, which is not
    * valid UTF-8 for characters above U+FFFF.
    *
    * Encodes the UTF-16 string str as UTF-8.
    *
    * Parameter str: the string to encode.
    * Returns: a string in which every character holds one UTF-8 byte
    *          (character code 0x00 to 0xFF), so the length of the returned
    *          string is the number of UTF-8 bytes.
    *
    * Unchanged from the original: U+0000 is written as the two bytes C0 80
    * and an unpaired surrogate is written as a 3-byte sequence. */
   var out = "";
   var len = str.length;
   var i, c, low;

   for (i = 0; i < len; i++) {
      c = str.charCodeAt(i);

      // High surrogate followed by a low surrogate: merge both code units
      // into the code point they represent and skip the low surrogate.
      if ((c >= 0xD800) && (c <= 0xDBFF) && (i + 1 < len)) {
         low = str.charCodeAt(i + 1);
         if ((low >= 0xDC00) && (low <= 0xDFFF)) {
            c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
            i++;
         }
      }

      if ((c >= 0x0001) && (c <= 0x007F)) {
         // 1 byte: 0xxxxxxx
         out += String.fromCharCode(c);
      } else if (c > 0xFFFF) {
         // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         out += String.fromCharCode(0xF0 | ((c >> 18) & 0x07));
         out += String.fromCharCode(0x80 | ((c >> 12) & 0x3F));
         out += String.fromCharCode(0x80 | ((c >>  6) & 0x3F));
         out += String.fromCharCode(0x80 | ((c >>  0) & 0x3F));
      } else if (c > 0x07FF) {
         // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
         out += String.fromCharCode(0xE0 | ((c >> 12) & 0x0F));
         out += String.fromCharCode(0x80 | ((c >>  6) & 0x3F));
         out += String.fromCharCode(0x80 | ((c >>  0) & 0x3F));
      } else {
         // 2 bytes: 110xxxxx 10xxxxxx (also reached for U+0000)
         out += String.fromCharCode(0xC0 | ((c >>  6) & 0x1F));
         out += String.fromCharCode(0x80 | ((c >>  0) & 0x3F));
      }
   }
   return out;
}

// Converts a UTF-8 encoded string (one byte per character, as returned by
// utf16to8) to an upper case ASCII string of hexadecimal digits. The result
// starts with the number of bytes of the string in hexadecimal notation,
// followed by two hexadecimal digits for every byte. No spaces are inserted
// between the values.
function utf8ToLengthPrefixedHex(sUtf8)
{
   var sLength = sUtf8.length.toString(16);
   // If the number of digits of the length is odd, a leading zero is needed.
   var sHex = (sLength.length % 2) ? "0" + sLength : sLength;
   for (var nByteIndex = 0; nByteIndex < sUtf8.length; nByteIndex++)
   {
      var nByteValue = sUtf8.charCodeAt(nByteIndex);
      if (nByteValue < 16) sHex += "0";
      sHex += nByteValue.toString(16);
   }
   return sHex.toUpperCase();
}

if (UltraEdit.document.length > 0)  // At least 1 file opened in UltraEdit?
{
   // Define environment for script.
   UltraEdit.ueReOn();              // Use UltraEdit search engine.
   UltraEdit.insertMode();          // Turn on insert mode.
   if (typeof(UltraEdit.columnModeOff) == "function") UltraEdit.columnModeOff();    // Turn off column mode.
   else if (typeof(UltraEdit.activeDocument.columnModeOff) == "function") UltraEdit.activeDocument.columnModeOff();

   // Get document index number of active file.
   var nCsvFileIndex = UltraEdit.activeDocumentIdx;
   // Has the active file the file extension JS?
   if (UltraEdit.activeDocument.isExt("js"))
   {
      // Yes, the active file is most likely this script file.
      // The CSV file is therefore either the first file if this
      // is not the active file, or otherwise the second file.
      nCsvFileIndex = !nCsvFileIndex ? 1 : 0;
   }
   var CsvFile = UltraEdit.document[nCsvFileIndex];

   CsvFile.selectAll();             // Select entire content of the CSV file.
   UltraEdit.selectClipboard(9);    // Copy selection to user clipboard 9 as selection property
   CsvFile.copy();                  // of UltraEdit document does not support Unicode strings.

   // Get just the strings in double quotes and the DOS line terminators into a large Unicode
   // string whereby every semicolon in the CSV file is replaced by a DOS line termination too.
   var sUnicodeStrings = UltraEdit.clipboardContent.replace(/\"(.+?)\";\"(.+?)\"/g,"$1\r\n$2");

   // Clear user clipboard 9 and cancel selection by moving caret to top of input file.
   // User clipboard 9 remains selected as it is used again below.
   UltraEdit.clearClipboard();
   CsvFile.top();
   // Split this large Unicode string into an array of Unicode strings on every DOS line termination.
   var asSearchReplace = sUnicodeStrings.split("\r\n");
   // Remove the last string if it is an empty string because file ended with a line termination.
   if (asSearchReplace[asSearchReplace.length-1] === "") asSearchReplace.pop();

   // The array of Unicode strings contains alternating search and replace strings which are
   // encoded in UTF-8 and converted to ASCII strings with the hexadecimal numbers of the
   // UTF-8 bytes. These arrays hold in memory the resulting search and replace strings and
   // the line numbers of the ignored strings.
   var asHexSearchStrings = [];
   var asHexReplaceStrings = [];
   var anIgnoredStrings = [];

   // Process the strings in pairs: even index = search string, odd index = replace string.
   // A trailing string without a partner (odd number of strings) is skipped instead of
   // being passed on as undefined value, which would abort the script with an error.
   for (var nStringIndex = 0; nStringIndex + 1 < asSearchReplace.length; nStringIndex += 2)
   {
      // Convert search and replace string from UTF-16 to UTF-8 after converting
      // every \n (backslash + n) in the string to carriage return + line-feed (0D 0A).
      var sUtf8Search = utf16to8(asSearchReplace[nStringIndex].replace(/\\n/g,"\r\n"));
      var sUtf8Replace = utf16to8(asSearchReplace[nStringIndex+1].replace(/\\n/g,"\r\n"));

      // If UTF-8 encoded replace string is longer than UTF-8 encoded search string, ignore those strings.
      if (sUtf8Replace.length > sUtf8Search.length)
      {
         // Remember the number of the pair, which is used below as line number
         // in the CSV file (one search/replace pair per line).
         anIgnoredStrings.push(nStringIndex / 2 + 1);
         continue;
      }

      var sNextSearch = utf8ToLengthPrefixedHex(sUtf8Search);
      var sNextReplace = utf8ToLengthPrefixedHex(sUtf8Replace);

      // Append '0' to replace string if shorter than search string. Two appended
      // zeros are one null byte, so both strings describe the same number of bytes.
      while (sNextReplace.length < sNextSearch.length)
      {
         sNextReplace += "0";
      }
      asHexSearchStrings.push(sNextSearch);
      asHexReplaceStrings.push(sNextReplace);
   }

   // Create report for the strings ignored because of length condition.
   if (anIgnoredStrings.length == 0)
   {
      UltraEdit.newFile();
      UltraEdit.activeDocument.write("No strings ignored because of right string longer than left string.");
   }
   else
   {
      // The call of the debug function var_dump was removed here as this function
      // is not defined in this script and therefore aborted the script with an error.
      UltraEdit.newFile();
      UltraEdit.activeDocument.unixMacToDos();
      UltraEdit.activeDocument.ASCIIToUnicode();
      UltraEdit.activeDocument.write("Following strings ignored because of right string longer than left string:\r\n\r\n");
      // Collect the ignored lines of the CSV file in the (cleared) user clipboard 9.
      for (var nIgnoredIndex = 0; nIgnoredIndex < anIgnoredStrings.length; nIgnoredIndex++)
      {
         CsvFile.gotoLine(anIgnoredStrings[nIgnoredIndex],1);
         CsvFile.selectLine();
         CsvFile.copyAppend();
      }
      CsvFile.top();
      UltraEdit.activeDocument.paste();
      UltraEdit.clearClipboard();
   }
   UltraEdit.activeDocument.top();

   // Create output file with the UTF-8 encoded strings in hexadecimal notation.
   // The text is built in a string variable and copied to the clipboard just once
   // (the clipboard is empty at this point) instead of extending the clipboard
   // content five times per string pair.
   var sHexLines = "";
   for (var nHexIndex = 0; nHexIndex < asHexSearchStrings.length; nHexIndex++)
   {
      sHexLines += "F: " + asHexSearchStrings[nHexIndex] + "\r\nR: " + asHexReplaceStrings[nHexIndex] + "\r\n";
   }
   UltraEdit.clipboardContent += sHexLines;
   UltraEdit.newFile();
   UltraEdit.activeDocument.unixMacToDos();
   UltraEdit.activeDocument.unicodeToASCII();
   UltraEdit.activeDocument.paste();
   UltraEdit.clearClipboard();
   UltraEdit.selectClipboard(0);
}

The second script runs on first two opened files. One file must be the file produced by first script. The other file must be opened in hex edit mode on which the hexadecimal replaces are executed.

Code: Select all

// Runs the hexadecimal replaces listed in one of the first two opened files
// (lines "F: <hex search string>" followed by "R: <hex replace string>")
// on the other one of these two files, which must be opened in hex edit mode.
if (UltraEdit.document.length > 1)  // At least 2 files opened in UltraEdit?
{
   // Define environment for script.
   UltraEdit.ueReOn();              // Use UltraEdit search engine.
   UltraEdit.insertMode();          // Turn on insert mode.
   if (typeof(UltraEdit.columnModeOff) == "function") UltraEdit.columnModeOff();    // Turn off column mode.
   else if (typeof(UltraEdit.activeDocument.columnModeOff) == "function") UltraEdit.activeDocument.columnModeOff();

   // Determine which file to edit and which file contains the strings by
   // finding out which file is opened in hex edit mode: first or second file.
   var FileToEdit = UltraEdit.document[0];
   var FileWithStrings = UltraEdit.document[1];
   if (UltraEdit.document[0].hexMode == false)
   {
      FileToEdit = UltraEdit.document[1];
      FileWithStrings = UltraEdit.document[0];
   }
   FileWithStrings.selectAll();  // Select entire content of the file with the strings.

   // Get just the hexadecimal values and the DOS line terminators into a large ASCII string.
   // The lazy group after "R: " matches only the first character of the replace string,
   // the rest of the line is left untouched. So the effect of this replace is the
   // removal of "F: " and "R: " from every pair of lines.
   var sAllStrings = FileWithStrings.selection.replace(/F: (.+?)\r\nR: (.+?)/g,"$1\r\n$2");
   FileWithStrings.top();        // Cancel selection by moving caret to top of file.
   // Split the large string into an array of strings with one string per line.
   var asSearchReplaceStrings = sAllStrings.split("\r\n");
   // Remove the last string only if it is an empty string because file ended with a line
   // termination. Otherwise the last string is a replace string which must be kept.
   if (asSearchReplaceStrings[asSearchReplaceStrings.length-1] === "") asSearchReplaceStrings.pop();

   // Make the replaces in hex edit mode on file to edit using the strings from the other file.
   FileToEdit.top();
   FileToEdit.findReplace.mode=0;
   FileToEdit.findReplace.searchAscii=false;
   FileToEdit.findReplace.matchCase=false;
   FileToEdit.findReplace.regExp=false;
   FileToEdit.findReplace.searchDown=true;
   FileToEdit.findReplace.replaceAll=false;
   // The strings are processed in pairs: even index = search string, odd index = replace
   // string. The loop variable is declared here as it is not declared anywhere else in
   // this script. A trailing search string without replace string is skipped.
   for (var nStringIndex = 0; nStringIndex + 1 < asSearchReplaceStrings.length; nStringIndex += 2)
   {
      FileToEdit.findReplace.replace(asSearchReplaceStrings[nStringIndex],asSearchReplaceStrings[nStringIndex+1]);
   }
}