Results of findDupLines.js from Macros & Scripts downloads page

Results of findDupLines.js from Macros & Scripts downloads page

1581
Power UserPower User
1581

    Apr 20, 2018#1

    I downloaded the script file findDupLines.js from UltraEdit macros and scripts download page.

    Why are lines 13 and 14 displayed as "Duplicate"?
    ue_dupLines.png (48.42KiB)
    UE 26.20.0.74 German / Win 10 x 64 Pro

    6,686585
    Grand MasterGrand Master
    6,686585

      Apr 21, 2018#2

      Well, the script findDupLines.js is indeed simple, as written in its first line by the anonymous user who contributed it, and it has some serious issues:
      1. Control of case sensitivity with variable caseSens does not work at all.
      2. The script does not really check if an entire line is a duplicate of another entire line. It just searches if string of current line is found anywhere below in another line in file.
      3. The script does not search for multiple duplicates of a line and instead reports duplicates multiple times, for example lines 3 and 5 are duplicates and lines 5 and 8 are duplicates if lines 3, 5 and 8 are partly or completely identical.
      4. The output of the script is not good for evaluating the results.
      I wanted to quickly improve this script and fix the issues. But the final script is rewritten more or less completely.

      Code: Select all

      // Duplicate line finder for the active file in UltraEdit.
      //
      // Every non-empty line is searched for in the lines below it. A line and
      // its duplicates are written as one group to the output window, each in
      // the form "full file name(line number): line text". Empty lines are
      // ignored and a line already reported as duplicate is not searched again.

      // THE COMPARISON IS CASE-SENSITIVE BY DEFAULT!

      // Set the following variable to 0 for a case-insensitive comparison.

      var nCaseSensitive = 1;

      // Selects the line at the caret in the active file and returns the
      // selected text with all carriage returns and line-feeds removed.
      function getTextOfSelectedLine()
      {
         UltraEdit.activeDocument.selectLine();
         return UltraEdit.activeDocument.selection.replace(/[\r\n]+/g,"");
      }

      if (UltraEdit.document.length > 0)
      {
         // Prepare the editor: insert mode, column mode off (with whichever
         // of the two columnModeOff functions exists) and caret at top of file.
         UltraEdit.insertMode();
         if (typeof UltraEdit.columnModeOff === "function")
         {
            UltraEdit.columnModeOff();
         }
         else if (typeof UltraEdit.activeDocument.columnModeOff === "function")
         {
            UltraEdit.activeDocument.columnModeOff();
         }
         UltraEdit.activeDocument.top();

         // Search settings: non-regex search downwards which is not restricted
         // to whole words. The case option is set finally further below.
         UltraEdit.ueReOn();
         UltraEdit.activeDocument.findReplace.mode = 0;
         UltraEdit.activeDocument.findReplace.matchCase = false;
         UltraEdit.activeDocument.findReplace.matchWord = false;
         UltraEdit.activeDocument.findReplace.regExp = false;
         UltraEdit.activeDocument.findReplace.searchDown = true;
         // Property searchInColumn is written only if it exists as boolean.
         if (typeof UltraEdit.activeDocument.findReplace.searchInColumn === "boolean")
         {
            UltraEdit.activeDocument.findReplace.searchInColumn = false;
         }

         UltraEdit.outputWindow.write("Beginning duplicate check...");

         // Apply the case option chosen at top of the script and report it.
         var bMatchCase = nCaseSensitive ? true : false;
         UltraEdit.activeDocument.findReplace.matchCase = bMatchCase;
         UltraEdit.outputWindow.write(bMatchCase ? "Case sensitivity enabled" : "Case sensitivity disabled");

         var nFoundTotal = 0;     // Number of duplicate lines reported so far.
         var nCurrent = 1;        // Number of the line currently examined.
         var nFirstPending = 0;   // Index in anReported at which the lookup starts.
         var anReported = [];     // Ascending line numbers of reported duplicates.
         var sFileName = UltraEdit.activeDocument.path;

         while (!UltraEdit.activeDocument.isEof())
         {
            // Was the current line already reported as duplicate of a line
            // above? The list is sorted, so the lookup ends at the first
            // entry greater than the current line number. The loose
            // comparison is kept as the list entries come from the host
            // property currentLineNum.
            var bReported = false;
            for (var nPos = nFirstPending; (nPos < anReported.length) && (anReported[nPos] <= nCurrent); nPos++)
            {
               if (anReported[nPos] == nCurrent)
               {
                  nFirstPending = nPos + 1;
                  bReported = true;
                  break;
               }
            }

            if (!bReported)
            {
               var sLine = getTextOfSelectedLine();

               if (sLine.length > 0)
               {
                  var bHeaderPending = true;
                  // Every caret is doubled for the non-regex find, presumably
                  // because ^ is the escape character of such a search
                  // (to be confirmed with the UltraEdit documentation).
                  var sSearch = sLine.replace(/\^/g,"^^");

                  while (UltraEdit.activeDocument.findReplace.find(sSearch))
                  {
                     var sCandidate = getTextOfSelectedLine();

                     // A found line counts as duplicate only if it has the
                     // same length as the current line, i.e. the found
                     // string is the entire line and not just a part of it.
                     if (sCandidate.length !== sLine.length) continue;

                     var nHit = UltraEdit.activeDocument.currentLineNum;

                     // The current line itself is written once as first
                     // line of the group, separated by an empty line.
                     if (bHeaderPending)
                     {
                        UltraEdit.outputWindow.write("\n" + sFileName + "(" + nCurrent + "): " + sLine);
                        bHeaderPending = false;
                     }

                     UltraEdit.outputWindow.write(sFileName + "(" + nHit + "): " + sCandidate);

                     // Insert the line number so that the list remains
                     // sorted in ascending order.
                     var nInsertAt = nFirstPending;
                     while ((nInsertAt < anReported.length) && (anReported[nInsertAt] <= nHit))
                     {
                        nInsertAt++;
                     }
                     anReported.splice(nInsertAt,0,nHit);
                     nFoundTotal++;
                  }
               }
            }

            // Continue on next line and stop if the caret could not be
            // set to this line.
            nCurrent++;
            UltraEdit.activeDocument.gotoLine(nCurrent,1);
            if (UltraEdit.activeDocument.currentLineNum != nCurrent) break;
         }

         UltraEdit.outputWindow.write("\nDuplicate check complete.");
         UltraEdit.outputWindow.showWindow(true);

         // Final summary with singular or plural wording.
         var sSummary = "No duplicate lines found in this file.";
         if (nFoundTotal > 0)
         {
            var sPlural = (nFoundTotal > 1) ? "s" : "";
            sSummary = nFoundTotal + " duplicate line" + sPlural + " found in this file.";
         }
         UltraEdit.outputWindow.write(sSummary);
      }
      
      Well, for small files it would definitely be better to load the entire file into memory as an array of strings and search for duplicate lines using this array. That would be faster because there is no window update during script execution.
      Best regards from an UC/UE/UES for Windows user from Austria

      1581
      Power UserPower User
      1581

        Apr 23, 2018#3

        Great, Mofi - thanks.

        Will the old file on the download page be replaced with your new one?
        UE 26.20.0.74 German / Win 10 x 64 Pro

        6,686585
        Grand MasterGrand Master
        6,686585

          Apr 23, 2018#4

          I could improve the code further and comment it, and then I could send it to IDM for uploading it on their server. But at the moment I am updating the scripts, macros and wordfiles contributed by me in the past for the downloads pages. I will create a new forum topic if I really decide to contribute a find duplicates script to the UltraEdit user community.
          Best regards from an UC/UE/UES for Windows user from Austria

          1032
          Power UserPower User
          1032

            Apr 24, 2018#5

            I'm also interested in seeing new versions of findDupLines.js.
            As I could see, there are a simple and a complex version for that task.
            Output log for complex version is far better to analyse the results.
            I think that would be a good opportunity to unify those versions.

            But it would be very interesting to see also a script to remove duplicate lines that are not successive.
            About this, I second the opinion that the script found on the Scripts page has bugs; it gives me the message "No duplicates found in this file; no lines removed!" even when there are several duplicate lines in the file.

            Mofi's suggestion of a Perl regular expression is very handy and does the job successfully.

            But how to remove duplicate lines that aren't successive?
            I suspect that a couple of changes to findDupLines.js (complex version) are enough to achieve that.

            6,686585
            Grand MasterGrand Master
            6,686585

              Apr 24, 2018#6

              Here is the script modified to remove all duplicates in file with exception of empty lines.

              Code: Select all

              // Duplicate line remover for the active file in UltraEdit.
              //
              // Every non-empty line is searched for in the lines below it and
              // each line found with identical content is deleted. Empty lines
              // are never removed. For each line with removed duplicates a note
              // in the form "full file name(line number): line text" is written
              // to the output window.

              // THE COMPARISON IS CASE-SENSITIVE BY DEFAULT!

              // Set the following variable to 0 for a case-insensitive comparison.

              var nCaseSensitive = 1;

              // Selects the line at the caret in the active file and returns the
              // selected text with all carriage returns and line-feeds removed.
              function getTextOfSelectedLine()
              {
                 UltraEdit.activeDocument.selectLine();
                 return UltraEdit.activeDocument.selection.replace(/[\r\n]+/g,"");
              }

              if (UltraEdit.document.length > 0)
              {
                 // Prepare the editor: insert mode, column mode off (with whichever
                 // of the two columnModeOff functions exists) and caret at top of file.
                 UltraEdit.insertMode();
                 if (typeof UltraEdit.columnModeOff === "function")
                 {
                    UltraEdit.columnModeOff();
                 }
                 else if (typeof UltraEdit.activeDocument.columnModeOff === "function")
                 {
                    UltraEdit.activeDocument.columnModeOff();
                 }
                 UltraEdit.activeDocument.top();

                 // Search settings: non-regex search downwards which is not restricted
                 // to whole words. The case option is set finally further below.
                 UltraEdit.ueReOn();
                 UltraEdit.activeDocument.findReplace.mode = 0;
                 UltraEdit.activeDocument.findReplace.matchCase = false;
                 UltraEdit.activeDocument.findReplace.matchWord = false;
                 UltraEdit.activeDocument.findReplace.regExp = false;
                 UltraEdit.activeDocument.findReplace.searchDown = true;
                 // Property searchInColumn is written only if it exists as boolean.
                 if (typeof UltraEdit.activeDocument.findReplace.searchInColumn === "boolean")
                 {
                    UltraEdit.activeDocument.findReplace.searchInColumn = false;
                 }

                 UltraEdit.outputWindow.write("Beginning duplicate removal...");

                 // Apply the case option chosen at top of the script and report it.
                 var bMatchCase = nCaseSensitive ? true : false;
                 UltraEdit.activeDocument.findReplace.matchCase = bMatchCase;
                 UltraEdit.outputWindow.write(bMatchCase ? "Case sensitivity enabled" : "Case sensitivity disabled");

                 var nRemovedTotal = 0;   // Number of lines removed from the file.
                 var nCurrent = 1;        // Number of the line currently examined.
                 var sFileName = UltraEdit.activeDocument.path;

                 while (!UltraEdit.activeDocument.isEof())
                 {
                    var sLine = getTextOfSelectedLine();

                    if (sLine.length > 0)
                    {
                       var nRemovedHere = 0;   // Duplicates removed for this line.
                       // Every caret is doubled for the non-regex find, presumably
                       // because ^ is the escape character of such a search
                       // (to be confirmed with the UltraEdit documentation).
                       var sSearch = sLine.replace(/\^/g,"^^");

                       while (UltraEdit.activeDocument.findReplace.find(sSearch))
                       {
                          var sCandidate = getTextOfSelectedLine();

                          // A found line is deleted only if it has the same
                          // length as the current line, i.e. the found string
                          // is the entire line and not just a part of it.
                          if (sCandidate.length !== sLine.length) continue;

                          UltraEdit.activeDocument.deleteText();
                          nRemovedHere++;
                          nRemovedTotal++;
                       }

                       if (nRemovedHere > 0)
                       {
                          var sPluralHere = (nRemovedHere > 1) ? "s" : "";
                          UltraEdit.outputWindow.write("\n" + sFileName + "(" + nCurrent + "): " + sLine);
                          UltraEdit.outputWindow.write("Removed " + nRemovedHere + " duplicate" + sPluralHere + " of this line");
                       }
                    }

                    // Continue on next line and stop if the caret could not be
                    // set to this line.
                    nCurrent++;
                    UltraEdit.activeDocument.gotoLine(nCurrent,1);
                    if (UltraEdit.activeDocument.currentLineNum != nCurrent) break;
                 }

                 UltraEdit.outputWindow.write("\nDuplicate removal complete.");
                 UltraEdit.outputWindow.showWindow(true);

                 // Final summary with singular or plural wording.
                 var sSummary = "No duplicate lines found in this file.";
                 if (nRemovedTotal > 0)
                 {
                    var sPluralTotal = (nRemovedTotal > 1) ? "s" : "";
                    sSummary = nRemovedTotal + " duplicate line" + sPluralTotal + " removed from this file.";
                 }
                 UltraEdit.outputWindow.write(sSummary);
              }
              
              Best regards from an UC/UE/UES for Windows user from Austria

              1032
              Power UserPower User
              1032

                Apr 24, 2018#7

                Thank you, Mofi.
                Your script works very well.

                Comparing it with findDupLines.js, I could see too many changes and I would not have been able to do that alone.


                👏