Script to find, remove and report repeating words within a line

Script to find, remove and report repeating words within a line

12
Basic UserBasic User
12

    Feb 14, 2012#1

    Hi all!

    I have a text file in which many words are repeated. Please help me write a script which does the following:
    1. Create a new file without repeated words. Every word of a line, whether it appears once or more than once, should be written only once.
    2. Create a new file containing the repeated words. It should contain only the words that appear more than once in a line.
    :D Sorry for my English.

    An example which makes it probably easier to understand:

    The text file like this:
    Line 1: a b c d d e
    Line 2: a b b c c d e e
    Line 3: a b c d e e

    1. New file like this:
    Line 1: a b c d e
    Line 2: a b c d e
    Line 3: a b c d e

    2. New file like this:
    Line 1: d
    Line 2: b c e
    Line 3: e

    Please help me, thanks.

    6,681583
    Grand MasterGrand Master
    6,681583

      Feb 15, 2012#2

      This script was an easy task for me. It is not 100% clear whether only the direct neighbour of a word can be a duplicate, or whether a word can exist several times anywhere on a line. Therefore I have coded the script below to compare every word on a line against all other words on the line instead of just the next word. If only the neighbour can be a duplicate, the script produces the same result, but performs more comparisons than really necessary and is therefore slower.

      Code: Select all

      // Removes repeated words from every line of the active file. The words
      // of a line are separated by a single space and compared case sensitive.
      // If at least one repeated word is found, two new files are created: one
      // with all lines of the file without the repetitions, and one with the
      // repeated words of every line on which a repetition was found (every
      // word reported only once per line). Otherwise a message box is shown.
      if (UltraEdit.document.length > 0)
      {
         // Define the environment for the script (insert mode, column mode
         // off, hex mode off). columnModeOff is called on whichever object
         // offers it as a function.
         UltraEdit.insertMode();
         if (typeof(UltraEdit.columnModeOff) == "function") UltraEdit.columnModeOff();
         else if (typeof(UltraEdit.activeDocument.columnModeOff) == "function") UltraEdit.activeDocument.columnModeOff();
         UltraEdit.activeDocument.hexOff();

         // Get the whole file contents via a selection as an array of lines.
         UltraEdit.activeDocument.selectAll();
         if (UltraEdit.activeDocument.isSel())
         {
            var asFileLines = UltraEdit.activeDocument.selection.split("\r\n");
            UltraEdit.activeDocument.top();  // Discards the selection.

            var sReport = "";          // Repeated words of all lines with repetitions.
            var bStartNewRow = false;  // True after a line with repetitions was processed.

            for (var nLine = 0; nLine < asFileLines.length; nLine++)
            {
               if (asFileLines[nLine].length == 0) continue;  // Skip empty lines.

               var asWords = asFileLines[nLine].split(" ");
               var bLineChanged = false;

               // Every word is compared with all words right of it. The words
               // left of it were already compared with it in a previous pass.
               for (var nKeep = 0; nKeep < asWords.length; nKeep++)
               {
                  var bReported = false;
                  var nProbe = nKeep + 1;
                  while (nProbe < asWords.length)
                  {
                     if (asWords[nKeep] != asWords[nProbe])
                     {
                        nProbe++;   // Different word, compare with next one.
                        continue;
                     }
                     // Remove the repetition. The next word moves to index
                     // nProbe, therefore the index is not incremented here.
                     asWords.splice(nProbe, 1);
                     if (bReported) continue;  // Word is already in the report.
                     bReported = true;
                     bLineChanged = true;
                     // Separate the word from the previous one in the report:
                     // line termination for the first word of a line, else a
                     // space (but nothing while the report is still empty).
                     if (bStartNewRow)
                     {
                        bStartNewRow = false;
                        sReport += "\r\n";
                     }
                     else if (sReport.length) sReport += ' ';
                     sReport += asWords[nKeep];
                  }
               }

               if (bLineChanged)
               {
                  // Rebuild the line from the remaining words.
                  asFileLines[nLine] = asWords.join(' ');
                  bStartNewRow = true;
               }
            }

            if (sReport.length > 0)
            {
               // Writes the passed text into a new file and moves the caret to top.
               var fnWriteNewFile = function (sText)
               {
                  UltraEdit.newFile();
                  UltraEdit.activeDocument.unixMacToDos();
                  UltraEdit.activeDocument.write(sText);
                  UltraEdit.activeDocument.top();
               };

               // First new file: all lines without the repeated words. A line
               // termination is appended if the last line is not an empty one.
               var sAllLines = asFileLines.join("\r\n");
               if (asFileLines[asFileLines.length - 1] != "") sAllLines += "\r\n";
               fnWriteNewFile(sAllLines);

               // Second new file: the repeated words found.
               fnWriteNewFile(sReport + "\r\n");
            }
            else
            {
               UltraEdit.messageBox("No duplicate word found on any line!");
            }
         }
      }

      21
      Basic UserBasic User
      21

        Feb 15, 2012#3

        To do fewer comparisons, you can use the following code:

        Code: Select all

        // Writes for every non-empty line of the active file the different
        // words of the line into a first new file and the words present more
        // than once on the line into a second new file. The words are separated
        // by a single space and compared case sensitive. The words of a line
        // are remembered as named properties of array objects.
        // NOTE(review): words equal to an existing array property name (for
        // example "length") are presumably not handled correctly - verify.
        if (UltraEdit.document.length > 0)
        {
           // Define the environment for the script.
           UltraEdit.insertMode();
           if (typeof(UltraEdit.columnModeOff) == "function") UltraEdit.columnModeOff();
           else if (typeof(UltraEdit.activeDocument.columnModeOff) == "function") UltraEdit.activeDocument.columnModeOff();
           UltraEdit.activeDocument.hexOff();

           // Get the whole file contents via a selection as an array of lines.
           UltraEdit.activeDocument.selectAll();
           if (UltraEdit.activeDocument.isSel())
           {
              var asLines = UltraEdit.activeDocument.selection.split("\r\n");
              UltraEdit.activeDocument.top();  // Discards the selection.

              var sSingle = "";    // Contents of the file with every word once per line.
              var sMultiple = "";  // Contents of the file with the repeated words.

              for (var nLineNum = 0; nLineNum < asLines.length; nLineNum++)
              {
                 if (asLines[nLineNum].length)   // Empty lines are ignored.
                 {
                    var asWords = asLines[nLineNum].split(" ");
                    var asSingleWords = [];
                    var asMultipleWords = [];

                    // A word not yet stored is stored as single word, a word
                    // already stored is (also) stored as multiple word.
                    for (var nActWord = 0; nActWord < asWords.length; nActWord++)
                    {
                       if ( asSingleWords[asWords[nActWord]] == null ) {
                          asSingleWords[asWords[nActWord]] = asWords[nActWord];
                       }
                       else {
                          // Counting would be possible here.
                          asMultipleWords[asWords[nActWord]] = asWords[nActWord];
                       }
                    }

                    // Append the stored words to the output strings. trim()
                    // is applied on the entire string collected so far and
                    // removes the whitespace at its beginning and its end
                    // before the line termination is appended.
                    for (var sWord in asSingleWords) sSingle += sWord + " ";
                    sSingle = sSingle.trim() + "\r\n";

                    for (var sRepeated in asMultipleWords) sMultiple += sRepeated + " ";
                    sMultiple = sMultiple.trim() + "\r\n";
                 }
              }

              // Writes the passed text into a new file and moves the caret to top.
              var fnWriteNewFile = function (sText)
              {
                 UltraEdit.newFile();
                 UltraEdit.activeDocument.unixMacToDos();
                 UltraEdit.activeDocument.write(sText);
                 UltraEdit.activeDocument.top();
              };

              fnWriteNewFile(sSingle);    // Lines without repeated words.
              fnWriteNewFile(sMultiple);  // Lines with the repeated words only.
           }
        }
        
        The order of the words may change in the rows (caused by associative arrays)!

        12
        Basic UserBasic User
        12

          Feb 15, 2012#4

          Thank you so much, that's perfect. But there is one problem: the words in my text file are separated by tab characters (because I copied the data from a Microsoft Excel file into a text file). Could you adapt your script for this? Thanks.

          6,681583
          Grand MasterGrand Master
          6,681583

            Feb 15, 2012#5

            The scripts contain several " " and ' ' strings. Replace the space character between the double or single quotes with \t and the scripts will work for tab-separated words. You should additionally replace the word "space" with "tab" in the comments.

            12
            Basic UserBasic User
            12

              Feb 15, 2012#6

              OK, Thanks a lot.

              6,681583
              Grand MasterGrand Master
              6,681583

                Feb 15, 2012#7

                Hi Jaretin,

                I doubt that your method with the "associative array" really results in less string comparisons. I suppose the opposite is the case. There are more comparisons, but they are done by the private functions of the array object and not visible.

                But before I start to explain why I think this method results in more string comparisons, this technique should not be used in principle as the article JavaScript "Associative Arrays" Considered Harmful referenced from Array docs on Mozilla Developer Network explains. There are other articles in WWW which warn also using arrays in this manner because JavaScript does not really support associative arrays, it just looks so.

                My method uses really an array of strings which is in real an array of pointers to strings and the strings are also arrays of characters. Your method uses arrays of objects. The objects have as member variables an identification string and the string value. So what happens on code

                Code: Select all

                asSingleWords[asWords[nActWord]] == null
                internally in the array? I don't know it for sure because I do not know the code of the JavaScript core, but from a C/C++ programmer's point of view I suppose an iterator loop is executed which compares the index string returned by asWords[nActWord] with the id string of every object in the array. So the current word is compared, by string comparisons, against all previously added words. This is like comparing the current word against all words to its left in the line. This code line alone would most likely result in the same number of string comparisons.

                But depending on the result, the next code to execute is either

                Code: Select all

                asSingleWords[asWords[nActWord]] = asWords[nActWord];
                or

                Code: Select all

                asMultipleWords[asWords[nActWord]] = asWords[nActWord];
                Both code lines will most likely result in one more iteration from first to last object in the array with comparing the id strings of the objects with the index string to find the right object to store the string value, or add a new object with a new id string and new string value.

                An array working with strings as index can't be faster than an array working with an integer as index.

                However, I found it interesting that there was no error on execution of your script because of function trim(). According to String methods documentation this method of the String object was added with JavaScript 1.8.1 and IDM states in help that JavaScript 1.7 is implemented in UltraEdit. I wrote this little script:

                Code: Select all

                // Reports in a message box whether strings support the method trim()
                // in the JavaScript engine executing this script. trim() is a method
                // of string instances and is therefore looked up on String.prototype.
                // A property String.trim is at most a non-standard extension of some
                // engines and its absence says nothing about "text".trim().
                var sResult = "";
                if (typeof(String.prototype.trim) != "function") sResult = " not";
                UltraEdit.messageBox("String method trim is" + sResult + " supported.");
                I executed it and got displayed the message: "String method trim is supported." That is very interesting. It looks like IDM has updated to a new version of the JavaScript core without writing it anywhere. The questions are now: In which version was the JavaScript core updated? And which JavaScript version is now working in UltraEdit?

                21
                Basic UserBasic User
                21

                  Feb 16, 2012#8

                  Hi Mofi,

                  I would normally agree to your point of view, but I tested the following codes against a file containing 3300 lines, each row containing 33 columns.

                  Code: Select all

                  // Benchmark version of the script using array objects with the words
                  // as property names. Writes for every non-empty line of the active file
                  // the different words of the line into a first new file, the words
                  // present more than once on the line into a second new file, and the
                  // start and end time of the script run into a third new file. The
                  // words are separated by a single tab and compared case sensitive.
                  if (UltraEdit.document.length > 0)
                  {
                     // Remember the start time of the script execution.
                     var d = new Date();
                     var myrun = "" + d.toLocaleString() + "\r\n";
                     // Define the environment for the script.
                     UltraEdit.insertMode();
                     if (typeof(UltraEdit.columnModeOff) == "function") UltraEdit.columnModeOff();
                     else if (typeof(UltraEdit.activeDocument.columnModeOff) == "function") UltraEdit.activeDocument.columnModeOff();
                     UltraEdit.activeDocument.hexOff();
                     // Select all and load the file contents into an array of lines.
                     UltraEdit.activeDocument.selectAll();

                     if (UltraEdit.activeDocument.isSel())
                     {
                        var asLines = UltraEdit.activeDocument.selection.split("\r\n");
                        UltraEdit.activeDocument.top();  // Discards the selection.

                        var sSingle = "";
                        var sMultiple = "";
                        // Search in every line for duplicate words which are separated
                        // by a single tab, remove all duplicate words within a line and
                        // create a string with the duplicate words in every line.
                        for (var nLineNum = 0; nLineNum < asLines.length; nLineNum++) {
                           if (!asLines[nLineNum].length) continue;  // Ignore empty lines.

                           // Split the line up into an array of word strings.
                           var asWords = asLines[nLineNum].split("\t");
                           var asSingleWords = new Array();
                           var asMultipleWords = new Array();

                           // Run a case sensitive word comparison from first to last word in line.
                           for (var nActWord = 0; nActWord < asWords.length; nActWord++) {
                              // Word found the first time in this line?
                              if ( asSingleWords[asWords[nActWord]] == null ) {
                                 asSingleWords[asWords[nActWord]] = asWords[nActWord];
                                 // Append the current word itself to the output. The
                                 // words are output in order of their first occurrence.
                                 sSingle += asWords[nActWord] + "\t";
                              }
                              else {
                                 // Counting would be possible here.
                                 asMultipleWords[asWords[nActWord]] = asWords[nActWord];
                              }
                           }

                           // Replace the single tab after the last word by the line
                           // termination. A non-empty line has always at least one word.
                           sSingle = sSingle.substring(0,sSingle.length-1) + "\r\n";
                           for (var temp in asMultipleWords) {
                              sMultiple += temp + "\t";
                           }
                           // trim() removes the whitespace at the end (and the beginning)
                           // of the entire string before the line termination is appended.
                           sMultiple = sMultiple.trim() + "\r\n";
                        }
                        // Output lines without duplicate words into a new file.
                        UltraEdit.newFile();
                        UltraEdit.activeDocument.unixMacToDos();
                        UltraEdit.activeDocument.write(sSingle);
                        UltraEdit.activeDocument.top();
                        // Output lines with only the duplicate words into a new file.
                        UltraEdit.newFile();
                        UltraEdit.activeDocument.unixMacToDos();
                        UltraEdit.activeDocument.write(sMultiple);
                        UltraEdit.activeDocument.top();
                        // Output start and end time of the script run into a new file.
                        UltraEdit.newFile();
                        UltraEdit.activeDocument.unixMacToDos();
                        d = new Date();
                        myrun += "" + d.toLocaleString() + "\r\n";
                        UltraEdit.activeDocument.write(myrun);
                        UltraEdit.activeDocument.top();
                     }
                  }
                  
                  Thursday, February 16, 2012 07:47:12
                  Thursday, February 16, 2012 07:50:47

                  Code: Select all

                  // Benchmark version of the script comparing every word of a line with
                  // all words right of it. The words are separated by a single tab and
                  // compared case sensitive. If at least one repeated word is found,
                  // three new files are created: all lines without the repetitions, the
                  // repeated words of every line with a repetition, and the start and
                  // end time of the script run. Otherwise only a message box is shown.
                  if (UltraEdit.document.length > 0)
                  {
                     // Remember the start time of the script execution.
                     var oNow = new Date();
                     var sRunTimes = "" + oNow.toLocaleString() + "\r\n";

                     // Define the environment for the script.
                     UltraEdit.insertMode();
                     if (typeof(UltraEdit.columnModeOff) == "function") UltraEdit.columnModeOff();
                     else if (typeof(UltraEdit.activeDocument.columnModeOff) == "function") UltraEdit.activeDocument.columnModeOff();
                     UltraEdit.activeDocument.hexOff();

                     // Get the whole file contents via a selection as an array of lines.
                     UltraEdit.activeDocument.selectAll();
                     if (UltraEdit.activeDocument.isSel())
                     {
                        var asFileLines = UltraEdit.activeDocument.selection.split("\r\n");
                        UltraEdit.activeDocument.top();  // Discards the selection.

                        var sReport = "";          // Repeated words of all lines with repetitions.
                        var bStartNewRow = false;  // True after a line with repetitions was processed.

                        for (var nLine = 0; nLine < asFileLines.length; nLine++)
                        {
                           if (asFileLines[nLine].length == 0) continue;  // Skip empty lines.

                           var asWords = asFileLines[nLine].split("\t");
                           var bLineChanged = false;

                           // Every word is compared with all words right of it.
                           for (var nKeep = 0; nKeep < asWords.length; nKeep++)
                           {
                              var bReported = false;
                              var nProbe = nKeep + 1;
                              while (nProbe < asWords.length)
                              {
                                 if (asWords[nKeep] != asWords[nProbe])
                                 {
                                    nProbe++;   // Different word, compare with next one.
                                    continue;
                                 }
                                 // Remove the repetition. The next word moves to index
                                 // nProbe, therefore the index is not incremented here.
                                 asWords.splice(nProbe, 1);
                                 if (bReported) continue;  // Word is already in the report.
                                 bReported = true;
                                 bLineChanged = true;
                                 // Separate the word from the previous one in the report:
                                 // line termination for the first word of a line, else a
                                 // tab (but nothing while the report is still empty).
                                 if (bStartNewRow)
                                 {
                                    bStartNewRow = false;
                                    sReport += "\r\n";
                                 }
                                 else if (sReport.length) sReport += '\t';
                                 sReport += asWords[nKeep];
                              }
                           }

                           if (bLineChanged)
                           {
                              // Rebuild the line from the remaining words.
                              asFileLines[nLine] = asWords.join('\t');
                              bStartNewRow = true;
                           }
                        }

                        if (sReport.length > 0)
                        {
                           // Writes the passed text into a new file and moves the caret to top.
                           var fnWriteNewFile = function (sText)
                           {
                              UltraEdit.newFile();
                              UltraEdit.activeDocument.unixMacToDos();
                              UltraEdit.activeDocument.write(sText);
                              UltraEdit.activeDocument.top();
                           };

                           // First new file: all lines without the repeated words. A line
                           // termination is appended if the last line is not an empty one.
                           var sAllLines = asFileLines.join("\r\n");
                           if (asFileLines[asFileLines.length - 1] != "") sAllLines += "\r\n";
                           fnWriteNewFile(sAllLines);

                           // Second new file: the repeated words found.
                           fnWriteNewFile(sReport + "\r\n");

                           // Third new file: start and end time of the script run. The end
                           // time is taken after the file is created as part of the run.
                           UltraEdit.newFile();
                           UltraEdit.activeDocument.unixMacToDos();
                           oNow = new Date();
                           sRunTimes += "" + oNow.toLocaleString() + "\r\n";
                           UltraEdit.activeDocument.write(sRunTimes);
                           UltraEdit.activeDocument.top();
                        }
                        else
                        {
                           UltraEdit.messageBox("No duplicate word found on any line!");
                        }
                     }
                  }
                  Thursday, February 16, 2012 07:51:11
                  Thursday, February 16, 2012 07:58:29

                  You are just guessing how an associative array is stored and managed. Maybe a B-tree is used and not an array? Whatever.

                  And yes, I was surprised too, that trim worked.

                  6,681583
                  Grand MasterGrand Master
                  6,681583

                    Feb 16, 2012#9

                    Okay, you proved that your version is definitely faster and so my assumptions on how "associative arrays" are internally handled were obviously wrong. Good to know for the future. Thanks.

                    12
                    Basic UserBasic User
                    12

                      Dec 03, 2012#10

                      Hi all, hi Mofi
                      in my example:

                      The text file like this:
                      Line 1: a b c d d e
                      Line 2: a b b c c d e e
                      Line 3: a b c d e e

                      1. New file like this:
                      Line 1: a b c d e
                      Line 2: a b c d e
                      Line 3: a b c d e

                      2. New file like this:
                      Line 1: d
                      Line 2: b c e
                      Line 3: e

                      Now I need a third new file like this:
                      3.file:
                      Line 1: a b c e
                      Line 2: a d
                      Line 3: a b c d

                      thanks for your help

                      6,681583
                      Grand MasterGrand Master
                      6,681583

                        Dec 03, 2012#11

                        I have taken the faster script from Jaretin and added the few code lines required to output into a third new file the lines with only unique words. The script runs on lines with words delimited by a horizontal tab and produces also files with lines with tab delimited words.

                        Code: Select all

                        // Creates three new files from the non-empty lines of the active file
                        // with words separated by a single tab and compared case sensitive:
                        //   1. every word of a line once (repetitions removed),
                        //   2. only the words present more than once on a line,
                        //   3. only the words present exactly once on a line.
                        // The words are output tab separated in order of their first
                        // occurrence in the line.
                        if (UltraEdit.document.length > 0)
                        {
                           // Define the environment for the script.
                           UltraEdit.insertMode();
                           if (typeof(UltraEdit.columnModeOff) == "function") UltraEdit.columnModeOff();
                           else if (typeof(UltraEdit.activeDocument.columnModeOff) == "function") UltraEdit.activeDocument.columnModeOff();
                           UltraEdit.activeDocument.hexOff();
                           // Select all and load the file contents into an array of lines.
                           UltraEdit.activeDocument.selectAll();
                           if (UltraEdit.activeDocument.isSel())
                           {
                              var asLines = UltraEdit.activeDocument.selection.split("\r\n");
                              UltraEdit.activeDocument.top();  // Discards the selection.

                              var sSingle = "";
                              var sMultiple = "";
                              var sUnique = "";

                              // Search in every line for duplicate words which are separated
                              // by a single tab and build the contents of the three new files.
                              for (var nLineNum = 0; nLineNum < asLines.length; nLineNum++) {
                                 if (!asLines[nLineNum].length) continue;  // Ignore empty lines.

                                 // Split the line up into an array of word strings.
                                 var asWords = asLines[nLineNum].split("\t");

                                 // The words are collected in real arrays to keep their order.
                                 // The lookup objects are used only to find out quickly if a
                                 // word was already collected. Their keys are the words with
                                 // character # prepended, so that a word can never be equal
                                 // to the name of a built-in property like "length" or
                                 // "constructor" and be taken wrongly as already collected.
                                 var asSingleWords = [];
                                 var asMultipleWords = [];
                                 var oSeen = {};
                                 var oRepeated = {};

                                 // Run a case sensitive word comparison from first to last word in line.
                                 for (var nActWord = 0; nActWord < asWords.length; nActWord++) {
                                    var sKey = "#" + asWords[nActWord];
                                    // Check if current word is found the first time in this line.
                                    if (oSeen[sKey] !== true) {
                                       oSeen[sKey] = true;
                                       asSingleWords.push(asWords[nActWord]);
                                    }
                                    // Word present more than once in this line. It is collected
                                    // only on its second occurrence. Counting would be possible here.
                                    else if (oRepeated[sKey] !== true) {
                                       oRepeated[sKey] = true;
                                       asMultipleWords.push(asWords[nActWord]);
                                    }
                                 }

                                 var bFirstWordInLine = true;
                                 for (var nWord = 0; nWord < asSingleWords.length; nWord++) {
                                    sSingle += asSingleWords[nWord] + "\t";
                                    // A word not collected as repeated word is a unique word.
                                    if (oRepeated["#" + asSingleWords[nWord]] !== true) {
                                       if (!bFirstWordInLine) sUnique += "\t";
                                       sUnique += asSingleWords[nWord];
                                       bFirstWordInLine = false;
                                    }
                                 }
                                 // trim() removes the tab after the last word. It works on the
                                 // entire string and removes all whitespace at beginning and end.
                                 sSingle = sSingle.trim() + "\r\n";
                                 // A line without any unique word is not written to the third file.
                                 if (!bFirstWordInLine) sUnique += "\r\n";

                                 for (nWord = 0; nWord < asMultipleWords.length; nWord++) {
                                    sMultiple += asMultipleWords[nWord] + "\t";
                                 }
                                 // As trim() removes also the line termination appended before,
                                 // a line without repeated words adds no empty line to the string.
                                 sMultiple = sMultiple.trim() + "\r\n";
                              }
                              // Output lines without duplicate words into a new file.
                              UltraEdit.newFile();
                              UltraEdit.activeDocument.unixMacToDos();
                              UltraEdit.activeDocument.write(sSingle);
                              UltraEdit.activeDocument.top();
                              // Output lines with only the duplicate words into a new file.
                              UltraEdit.newFile();
                              UltraEdit.activeDocument.unixMacToDos();
                              UltraEdit.activeDocument.write(sMultiple);
                              UltraEdit.activeDocument.top();
                              // Output lines with only the unique words into a new file.
                              UltraEdit.newFile();
                              UltraEdit.activeDocument.unixMacToDos();
                              UltraEdit.activeDocument.write(sUnique);
                              UltraEdit.activeDocument.top();
                           }
                        }

                        12
                        Basic UserBasic User
                        12

                          Dec 04, 2012#12

                          Thank you very much.