Search for below 20 words article

Search for below 20 words article

12
Basic UserBasic User
12

    Apr 12, 2017#1

    Code: Select all

    article_5-1.xml
    article_8-1.xml
    article_10-1.xml
    article_18-1.xml
    article_24-1.xml
    article_30-1.xml
    article_36-1.xml
    article_42-1.xml
    article_5-1.xml
    article_8-1.xml
    article_10-1.xml
    article_18-1.xml
    article_24-1.xml
    article_30-1.xml
    article_36-1.xml
    article_42-1.xml
    ...
    I have a folder structure like above.
    I want to find which article contains below 20 words in <body.content>...</body.content>

    The exceptions are give below...

    Code: Select all

    <p><a name="5-1" class="replica-location"/></p>
    <content><img src="img_5-1.jpg" credit="Getty Images"/>Somenath</content>
    Above tags are not countable.
    article_5-1.zip (1.15 KiB)   73
    Please find the .zip for reference

    13
    Basic UserBasic User
    13

      Apr 27, 2017#2

      Please see this,
      I hope this helps you

      Code: Select all

      	UltraEdit.outputWindow.clear();
      	var sDirectory = UltraEdit.getString("Enter Path of Files = ",1);
      	var sDirectory1 = sDirectory.replace(/\\/g,"\\");
      
      	UltraEdit.perlReOn();
      	UltraEdit.frInFiles.searchInFilesTypes="*.xml";
      	UltraEdit.frInFiles.directoryStart=sDirectory1 + "\\";
      	UltraEdit.frInFiles.openMatchingFiles=false;
      	UltraEdit.frInFiles.ignoreHiddenSubs=true;
      	UltraEdit.frInFiles.filesToSearch=0;
      	UltraEdit.frInFiles.useEncoding=true;
      	UltraEdit.frInFiles.encoding=65001;  // The files are UTF-8 encoded!
      	UltraEdit.frInFiles.useOutputWindow=true;
      	UltraEdit.frInFiles.matchCase=true;
      	UltraEdit.frInFiles.matchWord=false;
      	UltraEdit.frInFiles.preserveCase=false;
      	UltraEdit.frInFiles.searchSubs=true;
      	UltraEdit.frInFiles.replaceAll=true;
      	UltraEdit.frInFiles.regExp=true;
      	
      	UltraEdit.frInFiles.find("<body.content>([\\S\\s]+?)</body.content>");
      	UltraEdit.outputWindow.copy();
      	UltraEdit.newFile();
      	UltraEdit.activeDocument.paste();
      
      	UltraEdit.insertMode();
      	UltraEdit.columnModeOff();
      	UltraEdit.activeDocument.hexOff();
      	UltraEdit.perlReOn();
      	UltraEdit.activeDocument.findReplace.mode=0;
      	UltraEdit.activeDocument.findReplace.matchCase=true;
      	UltraEdit.activeDocument.findReplace.matchWord=false;
      	UltraEdit.activeDocument.findReplace.searchDown=true;
      	if (typeof(UltraEdit.activeDocument.findReplace.searchInColumn) == "boolean") {
      		UltraEdit.activeDocument.findReplace.searchInColumn=false;
      	}
      	// Move caret to top of the file.
      	UltraEdit.activeDocument.top();
      
      	// Determine once type of line termination.
      	var sLineTerm = "\r\n";           // Default is DOS.
      	if (typeof(UltraEdit.activeDocument.lineTerminator) == "number") {
      		if (UltraEdit.activeDocument.lineTerminator == 2) sLineTerm = "\n";
      		else if (UltraEdit.activeDocument.lineTerminator == 3) sLineTerm = "\r";
      	}
      	else {
      		// This version of UE/UES does not offer line terminator property.
      		UltraEdit.activeDocument.findReplace.regExp=false;
      		if (!UltraEdit.activeDocument.findReplace.find(sLineTerm)) {
      			sLineTerm = "\n";           // Not DOS, perhaps UNIX.
      			if (!UltraEdit.activeDocument.findReplace.find(sLineTerm)) {
      				sLineTerm = "\r";        // Also not UNIX, perhaps MAC.
      				if (!UltraEdit.activeDocument.findReplace.find(sLineTerm)) {
      					sLineTerm = "\r\n";   // No line terminator, use DOS.
      				}
      			}
      		}
      		UltraEdit.activeDocument.top();
      	}
      
      	UltraEdit.activeDocument.findReplace.regExp=true;
      	UltraEdit.activeDocument.findReplace.replaceAll=true;
      	UltraEdit.activeDocument.findReplace.replaceInAllOpen=false;
      	UltraEdit.activeDocument.findReplace.replace('<p><a name="[^\r\n]*?</p>','');
      	UltraEdit.activeDocument.findReplace.replace('<content>[^\r\n]*?</content>','');
      	UltraEdit.activeDocument.findReplace.replace("^Find[^\r\n]*?$",'');
      	UltraEdit.activeDocument.findReplace.replace('Found[^\r\n]*?$','');
      	UltraEdit.activeDocument.findReplace.replace('found[^\r\n]*?$','');
      	UltraEdit.activeDocument.findReplace.replace('Search complete, ','');
      	UltraEdit.activeDocument.findReplace.replace('^.*\\\\([^<>]*?)\.xml.*: <body.content>','$1.xml\r\n<body.content>');
      	UltraEdit.activeDocument.findReplace.replace('<body.content>','//body.content');
      	UltraEdit.activeDocument.findReplace.replace('</body.content>','///body.content');
      	UltraEdit.activeDocument.findReplace.replace('<[^<>]*?>','');
      	UltraEdit.activeDocument.findReplace.replace('(\\r?\\n){2,}','$1');
      	UltraEdit.activeDocument.findReplace.replaceAll=false;
      	while (UltraEdit.activeDocument.findReplace.find("(?s)//body.content.*///body.content"))
      	{
      		var sBlock = UltraEdit.activeDocument.selection.replace(/\/\/body.content.*(?:\r\n|\n|\r)|\/\/\/body.content/g,"");
      		if (sBlock == "") continue;  // Ignore empty blocks.
      
      		// Verify if at least 1 line terminator is found in the remaining block.
      		var nLineCount = sBlock.indexOf(sLineTerm);
      		if (nLineCount < 0)
      		{	// If no line terminator found, the block
      			nLineCount = 1;           // contains just a part of a line counted
      		}                            // nevertheless as 1 line.
      		else
      		{
      			// Block contains 1 or more lines. Split the block up into
      			// an array of strings each containing an entire line.
      			var asLines = sBlock.split(sLineTerm);
      			nLineCount = asLines.length;
      			// If the block ends with a line termination (last string is
      			// empty), decrease the number of lines by 1 to get correct result.
      			if (asLines[nLineCount-1] == "") nLineCount--;
      		}
      
      		// Replace all sequences of whitspace characters (spaces,
      		// tabs, line terminators, form-feeds) by a single space.
      		// This expression defines which string is interpreted as "word".
      		var sText = sBlock.replace(/\s+/g," ");
      
      		// Split the string of words into an array of strings each containing
      		// one word. The number of strings is equal the number of words.
      		var asWords = sText.split(" ");
      		var nWordCount = asWords.length;
      
      		// But if text ends with a space character (last string empty),
      		// the count must be decreased by one to get correct word count.
      		if (asWords[nWordCount-1] == "") nWordCount--;
      		// Also if the text starts with a space character (first string
      		// empty), the count must be decreased by one for correct count.
      		if (asWords[0] == "") nWordCount--;
      
      		// Write the result into the file below the still selected block.
      		// Selection is discarded with moving the caret to end of line
      		// even if the caret is already at end of the marker line.
      		WordCount = nWordCount.toString(10);
      
      		if(WordCount < 20)
      		{
      			UltraEdit.activeDocument.write("Total words: " + nWordCount);
      		}
      	}
      	UltraEdit.activeDocument.top();
      	UltraEdit.activeDocument.findReplace.replaceAll=true;
      	UltraEdit.activeDocument.findReplace.replace("----------------------------------------" + sLineTerm + ".+.xml" + sLineTerm + "//body.content[\\S\\s]+?///body.content" + sLineTerm,"");
      	if (UltraEdit.activeDocument.isEof() == 1)
      	{
      		UltraEdit.activeDocument.write("NO File Present in this Directory.");
      	}
      	UltraEdit.saveAs(sDirectory1 + "\\Report.txt");
      

      12
      Basic UserBasic User
      12

        Apr 28, 2017#3

        At first, I would like to thank you. It is working quite nicely but there will be some changes to be done. I am uploading a file which is below 20 words article but the script is ignore the file. Kindly help me.

        Somenath
        article_17-1.zip (1.31 KiB)   176
        Please find the attachment

        13
        Basic UserBasic User
        13

          Apr 28, 2017#4

          First of all sorry,
          I miss this case.

          Code: Select all

          	UltraEdit.outputWindow.clear();
          	var sDirectory = UltraEdit.getString("Enter Path of Files = ",1);
          	var sDirectory1 = sDirectory.replace(/\\/g,"\\");
          
          	UltraEdit.perlReOn();
          	UltraEdit.frInFiles.searchInFilesTypes="*.xml";
          	UltraEdit.frInFiles.directoryStart=sDirectory1 + "\\";
          	UltraEdit.frInFiles.openMatchingFiles=false;
          	UltraEdit.frInFiles.ignoreHiddenSubs=true;
          	UltraEdit.frInFiles.filesToSearch=0;
          	UltraEdit.frInFiles.useEncoding=true;
          	UltraEdit.frInFiles.encoding=65001;  // The files are UTF-8 encoded!
          	UltraEdit.frInFiles.useOutputWindow=true;
          	UltraEdit.frInFiles.matchCase=true;
          	UltraEdit.frInFiles.matchWord=false;
          	UltraEdit.frInFiles.preserveCase=false;
          	UltraEdit.frInFiles.searchSubs=true;
          	UltraEdit.frInFiles.replaceAll=true;
          	UltraEdit.frInFiles.regExp=true;
          	
          	UltraEdit.frInFiles.find("<body.content>([\\S\\s]+?)</body.content>");
          	UltraEdit.outputWindow.copy();
          	UltraEdit.newFile();
          	UltraEdit.activeDocument.paste();
          
          	UltraEdit.insertMode();
          	UltraEdit.columnModeOff();
          	UltraEdit.activeDocument.hexOff();
          	UltraEdit.perlReOn();
          	UltraEdit.activeDocument.findReplace.mode=0;
          	UltraEdit.activeDocument.findReplace.matchCase=true;
          	UltraEdit.activeDocument.findReplace.matchWord=false;
          	UltraEdit.activeDocument.findReplace.searchDown=true;
          	if (typeof(UltraEdit.activeDocument.findReplace.searchInColumn) == "boolean") {
          		UltraEdit.activeDocument.findReplace.searchInColumn=false;
          	}
          	// Move caret to top of the file.
          	UltraEdit.activeDocument.top();
          
          	// Determine once type of line termination.
          	var sLineTerm = "\r\n";           // Default is DOS.
          	if (typeof(UltraEdit.activeDocument.lineTerminator) == "number") {
          		if (UltraEdit.activeDocument.lineTerminator == 2) sLineTerm = "\n";
          		else if (UltraEdit.activeDocument.lineTerminator == 3) sLineTerm = "\r";
          	}
          	else {
          		// This version of UE/UES does not offer line terminator property.
          		UltraEdit.activeDocument.findReplace.regExp=false;
          		if (!UltraEdit.activeDocument.findReplace.find(sLineTerm)) {
          			sLineTerm = "\n";           // Not DOS, perhaps UNIX.
          			if (!UltraEdit.activeDocument.findReplace.find(sLineTerm)) {
          				sLineTerm = "\r";        // Also not UNIX, perhaps MAC.
          				if (!UltraEdit.activeDocument.findReplace.find(sLineTerm)) {
          					sLineTerm = "\r\n";   // No line terminator, use DOS.
          				}
          			}
          		}
          		UltraEdit.activeDocument.top();
          	}
          
          	UltraEdit.activeDocument.findReplace.regExp=true;
          	UltraEdit.activeDocument.findReplace.replaceAll=true;
          	UltraEdit.activeDocument.findReplace.replaceInAllOpen=false;
          	UltraEdit.activeDocument.findReplace.replace('<p><a name="[^\r\n]*?</p>','');
          	UltraEdit.activeDocument.findReplace.replace('<content>[^\r\n]*?</content>','');
          	UltraEdit.activeDocument.findReplace.replace("^Find[^\r\n]*?$",'');
          	UltraEdit.activeDocument.findReplace.replace('Found[^\r\n]*?$','');
          	UltraEdit.activeDocument.findReplace.replace('found[^\r\n]*?$','');
          	UltraEdit.activeDocument.findReplace.replace('Search complete, ','');
          	UltraEdit.activeDocument.findReplace.replace('^.*\\\\([^<>]*?)\.xml.*: <body.content>','$1.xml\r\n<body.content>');
          	UltraEdit.activeDocument.findReplace.replace('<body.content>','//body.content');
          	UltraEdit.activeDocument.findReplace.replace('</body.content>','///body.content');
          	UltraEdit.activeDocument.findReplace.replace('<[^<>]*?>','');
          	UltraEdit.activeDocument.findReplace.replace('(\\r?\\n){2,}','$1');
          	UltraEdit.activeDocument.findReplace.replaceAll=false;
          	while (UltraEdit.activeDocument.findReplace.find("(?s)//body.content.*///body.content"))
          	{
          		var sBlock = UltraEdit.activeDocument.selection.replace(/\/\/body.content.*(?:\r\n|\n|\r)|\/\/\/body.content/g,"");
          		
          		// Verify if at least 1 line terminator is found in the remaining block.
          		var nLineCount = sBlock.indexOf(sLineTerm);
          		if (nLineCount < 0)
          		{	// If no line terminator found, the block
          			nLineCount = 1;           // contains just a part of a line counted
          		}                            // nevertheless as 1 line.
          		else
          		{
          			// Block contains 1 or more lines. Split the block up into
          			// an array of strings each containing an entire line.
          			var asLines = sBlock.split(sLineTerm);
          			nLineCount = asLines.length;
          			// If the block ends with a line termination (last string is
          			// empty), decrease the number of lines by 1 to get correct result.
          			if (asLines[nLineCount-1] == "") nLineCount--;
          		}
          
          		// Replace all sequences of whitspace characters (spaces,
          		// tabs, line terminators, form-feeds) by a single space.
          		// This expression defines which string is interpreted as "word".
          		var sText = sBlock.replace(/\s+/g," ");
          
          		// Split the string of words into an array of strings each containing
          		// one word. The number of strings is equal the number of words.
          		var asWords = sText.split(" ");
          		var nWordCount = asWords.length;
          
          		// But if text ends with a space character (last string empty),
          		// the count must be decreased by one to get correct word count.
          		if (asWords[nWordCount-1] == "") nWordCount--;
          		
          		// Write the result into the file below the still selected block.
          		// Selection is discarded with moving the caret to end of line
          		// even if the caret is already at end of the marker line.
          		WordCount = nWordCount.toString(10);
          
          		if(WordCount < 20)
          		{
          			UltraEdit.activeDocument.write("Total words: " + nWordCount);
          		}
          	}
          	UltraEdit.activeDocument.top();
          	UltraEdit.activeDocument.findReplace.replaceAll=true;
          	UltraEdit.activeDocument.findReplace.replace("----------------------------------------" + sLineTerm + ".+.xml" + sLineTerm + "//body.content[\\S\\s]+?///body.content" + sLineTerm,"");
          	if (UltraEdit.activeDocument.isEof() == 1)
          	{
          		UltraEdit.activeDocument.write("NO File Present in this Directory.");
          	}
          	UltraEdit.saveAs(sDirectory1 + "\\Report.txt");
          
          I hope this script overcome your problem.

          12
          Basic UserBasic User
          12

            May 01, 2017#5

            Many many thanks to you.
            It's working really nice.

            Somenath