Notifications
Clear all

[Closed] Fast search through text file

You don’t need so many variables; just use a Dictionary with a string key and an integer value.
Then simply check whether the dictionary contains the key (the first word) and, if it does, increment the counter.
Here is an example; if you need to store strings along with the count, here is another example.

Thank you.

If I find a way to use the Python code inside MAXScript, and if the Python code is faster than your C# code, then I will definitely change the code.

	-- Greek letter names used as the lookup words (24 entries, all lower case).
	words = #("alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega")

(
	-- Counts, for every entry of 'words', how many lines of the text file start with it,
	-- using Python builtins through the MAXScript python bridge.
	-- Prints the elapsed time and heap usage; the block evaluates to the Python dict 'd'.
	cmd = python.import "__builtin__"
	unicode = cmd.unicode 
	lower = unicode.lower 

	t0 = timestamp()
	h0 = heapfree

	(
		-- Seed the dict with every word so that only these keys are ever counted.
		d = cmd.dict()
		for w in words do d[w] = 0

		f = cmd.open (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt")
		ss = f.readlines()
		-- All lines are in memory now, so release the file handle right away.
		f.close()

		for k=1 to ss.count do
		(
			-- First token of the line (split at most once).
			-- NOTE(review): how the single-quoted ' \t' argument reaches Python is not visible here; confirm the separator actually used.
			w = (unicode.split ss[k] ' \t' 1)[1]
			if w != undefined do
			(
				w = lower w
				-- Only words already present in the dict are incremented.
				if (n = d[w]) != undefined do d[w] = n + 1
			)
		)
	)

	format "time:% heap:%\n" (timestamp() - t0) (h0 - heapfree)
	d
)

time:214

For reference, I wrote it in C++:
time:75, of which 15 is spent reading the file

This one is pure Python, and it completes in ~250 ms on my ancient laptop.
But who knows how to pass the data to MXS.

import time
import re

# Words to look for at the start of each line (matched case-insensitively).
words = [
		"alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", 
		"mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"
]
words.sort()

# Per-word result: [number of matching lines, list of 1-based line numbers].
# Keys are lowercased because matches are lowercased before the lookup below.
data = {w.lower(): [0, []] for w in words}

t = time.time()

# Raw strings keep "\s" and "\b" intact for the regex engine (no invalid-escape
# warning), and re.escape keeps any regex metacharacter inside a word literal.
pattern = r"^\s*(" + "|".join(re.escape(w) for w in words) + r")\b"

print( pattern )

regex = re.compile( pattern, re.IGNORECASE )

line_number = 0
with open(r"C:\. . .\50000LinesOfText.txt", "r") as fp:
    # Iterating the file yields one line at a time; enumerate supplies the
    # 1-based line number.
    for line_number, line in enumerate(fp, start=1):
        match = regex.search( line )
        if match:
            # The pattern only matches listed words, so the key is always present.
            entry = data[match.group(1).lower()]
            entry[0] += 1
            entry[1].append( line_number )


# Fixed-point formatting; slicing str(float) breaks on scientific notation
# (very small elapsed times).
print( "%.3f sec." % (time.time() - t) )
# print( data )

Serejah, on my PC your Python code executes in ~120 ms when I collect the line numbers and the text of the lines. With your original code the time is ~115 ms.

The last code from Denis:
time:391 heap:18151864L

here is C# version with dynamic assembly:

(
	-- Compile the C# helper below into an in-memory assembly.
	-- cs_assembly becomes the compiled assembly, or undefined when compilation reported errors.
	cs_assembly = 
	(
	local source = @"
		using System;
		using System.IO;
		using System.Collections.Generic;

		public class TextProcessor
		{
			// First word of a line (lowercased) -> number of lines starting with it.
			public Dictionary<string, int> data;
			// Parallel arrays holding the keys and values of data, filled by ProcessFile.
			public string[] keys;
			public int[] vals;
			
			// Counts the lines of the file whose first word is listed in words.
			// The first word is lowercased before the comparison, so words must be lower case.
			// A null or empty words array counts every first word.
			public void ProcessFile(string file , string[] words)
			{	
				data = new Dictionary<string, int>();	
				bool countAll = (words == null || words.Length == 0);
			
				var spaces = new char[]{' ','\t'};
				// File.ReadAllLines never returns null, so no null check is needed.
				foreach(string line in File.ReadAllLines(file))
				{
					var word = line.TrimStart(spaces).Split()[0].ToLower();
					// Skip lines whose first word was not requested.
					if (!countAll && Array.IndexOf(words, word) < 0) continue;

					// TryGetValue leaves count at 0 for a word seen for the first time.
					int count;
					data.TryGetValue(word, out count);
					data[word] = count + 1;
				}
				keys = new string[data.Keys.Count];
				data.Keys.CopyTo(keys, 0);
				vals = new int[data.Values.Count];
				data.Values.CopyTo(vals, 0);
			}
		}"

		csharpProvider = dotnetobject "Microsoft.CSharp.CSharpCodeProvider"
		compilerParams = dotnetobject "System.CodeDom.Compiler.CompilerParameters"
		compilerParams.ReferencedAssemblies.AddRange #("System.dll");

		compilerParams.GenerateInMemory = on
		compilerResults = csharpProvider.CompileAssemblyFromSource compilerParams #(source)


		if (compilerResults.Errors.Count > 0 ) then
		(
			-- Collect every reported compiler message and print them in one go.
			local errs = stringstream ""
			for i = 0 to (compilerResults.Errors.Count-1) do
			(
				local err = compilerResults.Errors.Item[i]
				format "Error:% Line:% Column:% %\n" err.ErrorNumber err.Line err.Column err.ErrorText to:errs
			)
			format "%\n" errs
			undefined
		)
		else
		(
			compilerResults.CompiledAssembly		
		)

	)

	-- Run the benchmark only when the assembly was compiled; otherwise the errors were already printed.
	if cs_assembly != undefined do
	(
		gc()
		words = #("alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega")

		t0 = timestamp()
		h0 = heapfree

		tp = cs_assembly.CreateInstance "TextProcessor"
		tp.ProcessFile (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt") words
		
		t1 = timestamp()
		h1 = heapfree
		
		format "time:% heap:%\n" (t1 - t0) (h0 - h1)
		format "\tkeys:%\n" tp.keys
		format "\tnums:%\n" tp.vals
	)
)

time:42 on my machine

Thank you, Denis!
The fastest time I have is 58, the slowest is 80.

The code collects the “first” word of every line. I edited the 50000LinesOfText.txt file, adding 3 new lines at the start, and this is the result:

The words array does not include the “some” and “just”.

I am trying to update your code to work with string, not a file, using the Serejah code as an example.
https://dotnetfiddle.net/tid1Jd

3 Replies
(@denist)
Joined: 10 months ago

Posts: 0

Haha … I specially modified the code to work with a file

(@miauu)
Joined: 10 months ago

Posts: 0

Now I know how to use a file and a string.
Thank you.

The only thing left to solve is making the code collect only the words that are passed in. Right now, if I pass “kappa”, the code still collects all words.

(@denist)
Joined: 10 months ago

Posts: 0
			// Counts the lines of str whose first word (lowercased) occurs in words,
			// then copies the counted words and their totals into keys and vals.
			public void ProcessString(string str , String[] words)
			{
				data = new Dictionary<string, int>();

				char[] blanks = { ' ', '\t' };
				using (var reader = new System.IO.StringReader(str))
				{
					for (string text = reader.ReadLine(); text != null; text = reader.ReadLine())
					{
						// First whitespace-separated token of the line, lowercased.
						string head = text.TrimStart(blanks).Split()[0].ToLower();

						// Lines whose first word was not requested are ignored.
						if (Array.IndexOf(words, head) < 0) continue;

						// seen stays 0 when the word has not been counted yet.
						int seen;
						data.TryGetValue(head, out seen);
						data[head] = seen + 1;
					}
				}

				keys = new string[data.Count];
				data.Keys.CopyTo(keys, 0);
				vals = new int[data.Count];
				data.Values.CopyTo(vals, 0);
			}			

change to above, and call:

tp.ProcessString strToCheck #("kappa")
(
	-- Compile the C# helper below into an in-memory assembly.
	-- cs_assembly becomes the compiled assembly, or undefined when compilation reported errors.
	cs_assembly = 
	(
	local source = @"
		using System;
		using System.IO;
		using System.Collections.Generic;

		public class TextProcessor
		{
			// First word of a line (lowercased) -> number of lines starting with it.
			public Dictionary<string, int> data;
			// Parallel arrays holding the keys and values of data, filled by the Process methods.
			public string[] keys;
			public int[] vals;

			// Leading characters stripped from a line before its first word is taken.
			private static readonly char[] spaces = new char[]{' ','\t'};
			
			// Counts the lines of the file whose first word is listed in words.
			// The first word is lowercased before the comparison, so words must be lower case.
			// A null or empty words array counts every first word.
			public void ProcessFile(string file , string[] words)
			{	
				data = new Dictionary<string, int>();	
				// File.ReadAllLines never returns null, so no null check is needed.
				foreach(string line in File.ReadAllLines(file)) CountLine(line, words);
				PublishResults();
			}

			// Same as ProcessFile, but reads the lines from the string str.
			public void ProcessString(string str , string[] words)
			{	
				data = new Dictionary<string, int>();	
				using (System.IO.StringReader sr = new System.IO.StringReader(str))
				{
					string line;
					while ((line = sr.ReadLine()) != null) CountLine(line, words);
				}
				PublishResults();
			}			

			// Adds one to the counter of the first word of line when that word is requested.
			private void CountLine(string line, string[] words)
			{
				var word = line.TrimStart(spaces).Split()[0].ToLower();
				bool countAll = (words == null || words.Length == 0);
				if (!countAll && Array.IndexOf(words, word) < 0) return;

				// TryGetValue leaves count at 0 for a word seen for the first time.
				int count;
				data.TryGetValue(word, out count);
				data[word] = count + 1;
			}

			// Copies the dictionary content into the keys and vals arrays.
			private void PublishResults()
			{
				keys = new string[data.Keys.Count];
				data.Keys.CopyTo(keys, 0);
				vals = new int[data.Values.Count];
				data.Values.CopyTo(vals, 0);
			}
		}"

		csharpProvider = dotnetobject "Microsoft.CSharp.CSharpCodeProvider"
		compilerParams = dotnetobject "System.CodeDom.Compiler.CompilerParameters"
		compilerParams.ReferencedAssemblies.AddRange #("System.dll");

		compilerParams.GenerateInMemory = on
		compilerResults = csharpProvider.CompileAssemblyFromSource compilerParams #(source)


		if (compilerResults.Errors.Count > 0 ) then
		(
			-- Collect every reported compiler message and print them in one go.
			local errs = stringstream ""
			for i = 0 to (compilerResults.Errors.Count-1) do
			(
				local err = compilerResults.Errors.Item[i]
				format "Error:% Line:% Column:% %\n" err.ErrorNumber err.Line err.Column err.ErrorText to:errs
			)
			format "%\n" errs
			undefined
		)
		else
		(
			compilerResults.CompiledAssembly		
		)

	)

	-- Run the benchmark only when the assembly was compiled; otherwise the errors were already printed.
	if cs_assembly != undefined do
	(
		gc()
		words = #("alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega")

		t0 = timestamp()
		h0 = heapfree

		tp = cs_assembly.CreateInstance "TextProcessor"
		--tp.ProcessFile (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt") words
		tp.ProcessString strToCheck words
		
		t1 = timestamp()
		h1 = heapfree
		
		format "time:% heap:%\n" (t1 - t0) (h0 - h1)
		format "\tkeys:%\n" tp.keys
		format "\tnums:%\n" tp.vals
	)
)

Thank you.
This

(Array.IndexOf(words, word) > -1)

is the c# equivalent of findItem.

Can the dictionary in C# be constructed with 3 elements
1- word
2- line in which the word is found
3- text of that line.

Right now the code counts how many lines start with a given word, but it does not collect the line numbers or the line text.

Page 6 / 10