Notifications

Clear all

[Closed] Fast search through text file

Page 6 / 10 Prev Next

Jan 16, 2023 7:23 pm

You don’t need so many variables; just use a Dictionary with a string key and an integer value,
then simply check whether the dictionary has the key (the first word) and, if it does, increment the counter.
Here is an example; or, if you need to store the strings along with the count, another example.

miauu

Jan 16, 2023 7:23 pm

Thank you.

If I find a way to use the python code inside maxscript and if the python code is faster than your c# code then I definitely will change the code.

denisT

Jan 16, 2023 7:23 pm

	-- Words that are counted when they appear as the first word of a line.
	words = 
	#(
		"alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", 
		"mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"
	)

-- Benchmark: count, per listed word, how many lines of the test file start with it,
-- using Python's builtins from MAXScript. Prints elapsed time and heap used, returns the dict.
(
	-- "__builtin__" is the Python 2 name of the builtins module.
	cmd = python.import "__builtin__"
	unicode = cmd.unicode 
	lower = unicode.lower 

	t0 = timestamp()
	h0 = heapfree

	(
		-- Seed the Python dict so that only the listed words have an entry.
		d = cmd.dict()
		for w in words do d[w] = 0

		f = cmd.open (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt")
		ss = f.readlines()
		-- All lines are in memory now; release the file handle instead of leaking it.
		f.close()

		for k=1 to ss.count do
		(
			-- First token of the line (at most one split).
			-- NOTE(review): ' \t' is single-quoted here; confirm it reaches unicode.split
			-- as the intended separator argument.
			w = (unicode.split ss[k] ' \t' 1)[1]
			if w != undefined do
			(
				w = lower w
				-- Only words already present in the dict are incremented.
				if (n = d[w]) != undefined do d[w] = n + 1
			)
		)
	)

	format "time:% heap:%\n" (timestamp() - t0) (h0 - heapfree)
	d
)

time:214

For reference… I wrote it in C++:
time:75, of which 15 is spent reading the file

Serejah

Jan 16, 2023 7:23 pm

This one is pure Python, and it completes in ~250 ms on my ancient laptop.
But hell knows how to pass the data to MXS.

import re
import time

words = [
		"alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", 
		"mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"
]
words.sort()


def build_pattern(words):
    """Return the regex source that matches a line starting with one of ``words``.

    Leading whitespace is allowed and the word must end at a word boundary.
    Raw strings are used because ``"\\s"`` inside a normal string literal is an
    invalid escape sequence; each word is escaped so it is matched literally.
    """
    return r"^\s*(" + "|".join(re.escape(w) for w in words) + r")\b"


def count_first_words(lines, words):
    """Count the lines whose first word is one of ``words`` (case-insensitive).

    Args:
        lines: any iterable of text lines (an open text file works).
        words: the words to look for; they become the keys of the result.

    Returns:
        dict mapping each word to ``[count, [line_number, ...]]`` where line
        numbers are 1-based.
    """
    data = {w: [0, []] for w in words}
    regex = re.compile(build_pattern(words), re.IGNORECASE)

    for line_number, line in enumerate(lines, 1):
        match = regex.search(line)
        if match:
            key = match.group(1).lower()
            # Keys are stored as given, so a word supplied with capitals is
            # not found after lower-casing and is skipped rather than raising.
            if key in data:
                data[key][0] += 1
                data[key][1].append(line_number)

    return data


if __name__ == "__main__":
    t = time.time()

    print( build_pattern( words ) )

    with open(r"C:\. . .\50000LinesOfText.txt", "r") as fp:
        data = count_first_words( fp, words )

    # Fixed-point formatting: slicing str(float) breaks for very small values
    # that are rendered in scientific notation.
    print( "{:.3f} sec.".format( time.time() - t ) )
    # print( data )

miauu

Jan 16, 2023 7:23 pm

Serejah, on my PC your Python code executes in ~120 ms when I collect the line numbers and the text of the lines. With your original code the time is ~115 ms.

The last code from Denis:
time:391 heap:18151864L

denisT

Jan 16, 2023 7:23 pm

here is C# version with dynamic assembly:

-- Compiles a small C# class in memory, then times TextProcessor.ProcessFile on the
-- test file and prints the elapsed time, heap difference, keys and counts.
(
	-- cs_assembly receives the compiled in-memory assembly, or undefined when the
	-- compiler reported any entries in its Errors collection.
	cs_assembly = 
	(
	-- C# source of TextProcessor. ProcessFile tallies the lower-cased first token of
	-- every line into `data`, then copies the keys and counts into `keys` and `vals`.
	-- NOTE(review): the `words` argument is accepted by ProcessFile but never read,
	-- so every first word is counted, not only the listed ones.
	local source = @"
		using System;
		using System.IO;
		using System.Collections.Generic;

		public class TextProcessor
		{
			public Dictionary<string, int> data;
			public string[] keys;
			public int[] vals;
			
			public void ProcessFile(string file , string[] words)
			{	
				data = new Dictionary<string, int>();	
			
				var spaces = new char[]{' ','	'};
				string[] lines = File.ReadAllLines(file);
				if (lines != null)
				{
					foreach(string line in lines)
					{
						var word = line.TrimStart(spaces).Split()[0].ToLower();
						if (data.ContainsKey(word)) data[word]++;
						else data[word] = 1;
					}
				}
				keys = new string[data.Keys.Count];
				data.Keys.CopyTo(keys, 0);
				vals = new int[data.Values.Count];
				data.Values.CopyTo(vals, 0);
			}
		}"

		-- Compile the source above into an in-memory assembly that references System.dll.
		csharpProvider = dotnetobject "Microsoft.CSharp.CSharpCodeProvider"
		compilerParams = dotnetobject "System.CodeDom.Compiler.CompilerParameters"
		compilerParams.ReferencedAssemblies.AddRange #("System.dll");

		compilerParams.GenerateInMemory = on
		compilerResults = csharpProvider.CompileAssemblyFromSource compilerParams #(source)


		-- On compiler messages: collect them into a stringstream, print it, yield undefined.
		if (compilerResults.Errors.Count > 0 ) then
		(
			local errs = stringstream ""
			for i = 0 to (compilerResults.Errors.Count-1) do
			(
				local err = compilerResults.Errors.Item[i]
				format "Error:% Line:% Column:% %\n" err.ErrorNumber err.Line err.Column err.ErrorText to:errs
			)
			format "%\n" errs
			undefined
		)
		else
		(
			compilerResults.CompiledAssembly		
		)

	)

	-- Collect garbage before sampling heapfree so the heap figure is less noisy.
	gc()
	words = #("alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega")

	t0 = timestamp()
	h0 = heapfree

	-- NOTE(review): cs_assembly is undefined when compilation failed; the
	-- CreateInstance call below does not check for that.
	tp = cs_assembly.CreateInstance "TextProcessor"
	tp.ProcessFile (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt") words
	
	t1 = timestamp()
	h1 = heapfree
	
	format "time:% heap:%\n" (t1 - t0) (h0 - h1)
	format "\tkeys:%\n" tp.keys
	format "\tnums:%\n" tp.vals
)

time:42 on my machine

miauu

Jan 16, 2023 7:23 pm

Thank you, Denis!
The fastest time I have is 58, the slowest is 80.

The code collects all “first” words on all lines. I edited the 50000LinesOfText.txt file, adding 3 new lines at the start, and this is the result:

The words array does not include “some” or “just”.

I am trying to update your code to work with string, not a file, using the Serejah code as an example.
https://dotnetfiddle.net/tid1Jd

3 Replies

denisT

(@denist)

Joined: 1 year ago

Posts: 0

Jan 16, 2023 7:23 pm

Reply to

miauu

Haha … I specially modified the code to work with a file

miauu

(@miauu)

Joined: 1 year ago

Posts: 0

Jan 16, 2023 7:23 pm

Reply to

denisT

Now I know how to use a file and a string.
Thank you.

The only thing left to solve is to make the code collect only the passed words. If I pass “kappa”, the code still collects all of them.

denisT

(@denist)

Joined: 1 year ago

Posts: 0

Jan 16, 2023 7:23 pm

Reply to

miauu

			/// <summary>
			/// Counts, for each line of <paramref name="str"/>, the lower-cased first
			/// token when it is one of <paramref name="words"/>. Results are stored in
			/// <c>data</c>, with keys and counts copied to <c>keys</c> and <c>vals</c>.
			/// </summary>
			/// <param name="str">Text to scan, split into lines by StringReader.</param>
			/// <param name="words">
			/// Lower-case words to count (the comparison is case-sensitive against the
			/// lower-cased token). When null, every first token is counted.
			/// </param>
			public void ProcessString(string str , String[] words)
			{
				data = new Dictionary<string, int>();

				var spaces = new char[]{' ','\t'};
				using (System.IO.StringReader sr = new System.IO.StringReader(str))
				{
					string line;
					while ((line = sr.ReadLine()) != null)
					{
						// Invariant lower-casing: the result must not depend on the
						// current culture (e.g. the Turkish dotless i).
						var word = line.TrimStart(spaces).Split()[0].ToLowerInvariant();

						// Skip tokens that were not asked for; a null list means no filter.
						if (words != null && Array.IndexOf(words, word) < 0) continue;

						// TryGetValue leaves seen at 0 for a new key: one lookup, one store.
						int seen;
						data.TryGetValue(word, out seen);
						data[word] = seen + 1;
					}
				}
				keys = new string[data.Keys.Count];
				data.Keys.CopyTo(keys, 0);
				vals = new int[data.Values.Count];
				data.Values.CopyTo(vals, 0);
			}

Change it to the above, and call:

tp.ProcessString strToCheck #("kappa")

denisT

Jan 16, 2023 7:23 pm

-- Compiles the TextProcessor C# class in memory, then times ProcessString on
-- strToCheck and prints the elapsed time, heap difference, keys and counts.
(
	-- cs_assembly receives the compiled in-memory assembly, or undefined when the
	-- compiler reported any entries in its Errors collection.
	cs_assembly = 
	(
	-- The C# source must not contain double quotes: it lives inside this verbatim string.
	local source = @"
		using System;
		using System.IO;
		using System.Collections.Generic;

		public class TextProcessor
		{
			public Dictionary<string, int> data;
			public string[] keys;
			public int[] vals;

			// Characters stripped from the start of each line before taking the first token.
			private static readonly char[] Spaces = new char[]{' ','\t'};

			// Counts the listed first words of every line in a text file.
			// A null words array counts every first word.
			public void ProcessFile(string file , string[] words)
			{
				using (StreamReader reader = File.OpenText(file))
				{
					CountFirstWords(reader, words);
				}
			}

			// Counts the listed first words of every line in a string.
			// A null words array counts every first word.
			public void ProcessString(string str , string[] words)
			{
				using (StringReader reader = new StringReader(str))
				{
					CountFirstWords(reader, words);
				}
			}

			// Shared worker: tallies the lower-cased first token of each line read
			// from reader, keeping only tokens found in words (when words is given),
			// then publishes the result through data, keys and vals.
			private void CountFirstWords(TextReader reader, string[] words)
			{
				var counts = new Dictionary<string, int>();

				string line;
				while ((line = reader.ReadLine()) != null)
				{
					// Invariant lower-casing so the match does not depend on the current culture.
					string word = line.TrimStart(Spaces).Split()[0].ToLowerInvariant();
					if (words != null && Array.IndexOf(words, word) < 0) continue;

					// TryGetValue leaves seen at 0 for a new key.
					int seen;
					counts.TryGetValue(word, out seen);
					counts[word] = seen + 1;
				}

				data = counts;
				keys = new string[counts.Count];
				counts.Keys.CopyTo(keys, 0);
				vals = new int[counts.Count];
				counts.Values.CopyTo(vals, 0);
			}
		}"

		-- Compile the source above into an in-memory assembly that references System.dll.
		csharpProvider = dotnetobject "Microsoft.CSharp.CSharpCodeProvider"
		compilerParams = dotnetobject "System.CodeDom.Compiler.CompilerParameters"
		compilerParams.ReferencedAssemblies.AddRange #("System.dll");

		compilerParams.GenerateInMemory = on
		compilerResults = csharpProvider.CompileAssemblyFromSource compilerParams #(source)


		-- On compiler messages: collect them into a stringstream, print it, yield undefined.
		if (compilerResults.Errors.Count > 0 ) then
		(
			local errs = stringstream ""
			for i = 0 to (compilerResults.Errors.Count-1) do
			(
				local err = compilerResults.Errors.Item[i]
				format "Error:% Line:% Column:% %\n" err.ErrorNumber err.Line err.Column err.ErrorText to:errs
			)
			format "%\n" errs
			undefined
		)
		else
		(
			compilerResults.CompiledAssembly		
		)

	)

	-- Run the benchmark only when compilation produced an assembly; calling
	-- CreateInstance on undefined would raise a runtime error.
	if cs_assembly != undefined do
	(
		gc()
		words = #("alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega")

		t0 = timestamp()
		h0 = heapfree

		tp = cs_assembly.CreateInstance "TextProcessor"
		--tp.ProcessFile (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt") words
		tp.ProcessString strToCheck words

		t1 = timestamp()
		h1 = heapfree

		format "time:% heap:%\n" (t1 - t0) (h0 - h1)
		format "\tkeys:%\n" tp.keys
		format "\tnums:%\n" tp.vals
	)
)