You don’t need to use so many variables; just use a Dictionary with a string key and an integer value,
then simply check whether the dictionary contains the key (the first word) and, if it does, increment the counter.
Here is an example; if you also need to store strings along with the count, here is another example.
Thank you.
If I find a way to use the Python code inside MAXScript, and if the Python code is faster than your C# code, then I will definitely change the code.
-- Words whose occurrences as the first word of a line are counted.
words =
#(
    "alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda",
    "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"
)
-- Counts, for every entry of `words`, how many lines of 50000LinesOfText.txt start with it,
-- using Python builtins driven from MAXScript. Prints the elapsed time and heap usage and
-- evaluates to the Python dict of counts.
(
    -- "__builtin__" is the Python 2 name of the builtins module.
    cmd = python.import "__builtin__"
    unicode = cmd.unicode
    lower = unicode.lower

    t0 = timestamp()
    h0 = heapfree
    (
        -- Seed the dict with a zero count for every tracked word; only seeded keys are incremented below.
        d = cmd.dict()
        for w in words do d[w] = 0

        f = cmd.open (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt")
        ss = f.readlines()
        -- Release the file handle as soon as the lines are in memory (the handle was previously leaked).
        f.close()

        for k = 1 to ss.count do
        (
            -- NOTE(review): Python's split treats a separator string as one whole delimiter,
            -- so ' \t' may not mean "space or tab" here -- confirm how MAXScript passes this value.
            w = (unicode.split ss[k] ' \t' 1)[1]
            if w != undefined do
            (
                w = lower w
                -- Words that were not seeded are skipped rather than added.
                if (n = d[w]) != undefined do d[w] = n + 1
            )
        )
    )
    format "time:% heap:%\n" (timestamp() - t0) (h0 - heapfree)
    d
)
time:214
for reference… I wrote it in c++
time:75, of which 15 is spent reading the file
This one is pure Python and it completes in ~250 ms on my ancient laptop.
But hell knows how to pass the data to mxs
import time
import re
words = [
    "alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda",
    "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"
]
words.sort()


def first_word_pattern(words):
    """Return the regex source that matches a line starting with one of ``words``.

    Leading whitespace is allowed, and the word must end at a word boundary so
    that e.g. "alphabet" does not count as "alpha". Each word is escaped, so
    regex metacharacters in a word are matched literally.
    """
    alternatives = "|".join(re.escape(w) for w in words)
    return r"^\s*(" + alternatives + r")\b"


def count_first_words(lines, words):
    """Count the lines that start with each of ``words`` (case-insensitive).

    Args:
        lines: iterable of text lines (an open text file works).
        words: iterable of lower-case words to look for.

    Returns:
        dict mapping every word to ``[count, line_numbers]``, where
        ``line_numbers`` lists the 1-based numbers of the matching lines.
        Words that never match keep ``[0, []]``.
    """
    # A fresh list per word; sharing one list between keys would mix the line numbers.
    data = dict((w, [0, []]) for w in words)
    regex = re.compile(first_word_pattern(words), re.IGNORECASE)
    for line_number, line in enumerate(lines, 1):
        match = regex.match(line)
        if not match:
            continue
        key = match.group(1).lower()
        # The match is case-insensitive, so the lower-cased text may still differ from
        # the key as given (e.g. when a word was supplied in upper case).
        if key in data:
            data[key][0] += 1
            data[key][1].append(line_number)
    return data


if __name__ == "__main__":
    t = time.time()
    print(first_word_pattern(words))
    with open(r"C:\. . .\50000LinesOfText.txt", "r") as fp:
        data = count_first_words(fp, words)
    # Fixed-point formatting: slicing str(float) misprints values shown in scientific notation.
    print("%.3f sec." % (time.time() - t))
    # print( data )
Serejah, on my PC your Python code executes in ~120 ms when I collect the line numbers and the text of the lines. With your original code the time is ~115 ms.
The last code from Denis:
time:391 heap:18151864L
here is C# version with dynamic assembly:
-- Compiles a small C# helper in memory and uses it to count how many lines of
-- 50000LinesOfText.txt start with each of the given words, then prints timing and results.
(
    -- Evaluates to the compiled assembly, or undefined when compilation fails.
    cs_assembly =
    (
        -- The C# source lives in a verbatim string, so it must not contain double quotes.
        local source = @"
using System;
using System.IO;
using System.Collections.Generic;

public class TextProcessor
{
    public Dictionary<string, int> data;
    public string[] keys;
    public int[] vals;

    // Counts, per word, how many lines of the file start with that word (case-insensitive).
    // Only the words listed in words are counted; passing null counts every first word.
    // Results are published in data and, as parallel arrays, in keys and vals.
    public void ProcessFile(string file, string[] words)
    {
        data = new Dictionary<string, int>();
        Dictionary<string, bool> wanted = BuildLookup(words);

        foreach (string line in File.ReadAllLines(file))
        {
            // A null separator splits on any whitespace; RemoveEmptyEntries skips leading
            // indentation (spaces or tabs), and the limit of 2 avoids splitting the whole line.
            string[] parts = line.Split((char[])null, 2, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length == 0) continue; // blank line

            string word = parts[0].ToLowerInvariant();
            if (wanted != null && !wanted.ContainsKey(word)) continue;

            int n;
            data.TryGetValue(word, out n); // n stays 0 for a word not seen yet
            data[word] = n + 1;
        }

        keys = new string[data.Count];
        data.Keys.CopyTo(keys, 0);
        vals = new int[data.Count];
        data.Values.CopyTo(vals, 0);
    }

    // Lower-cased lookup of the requested words; null means no filtering.
    private static Dictionary<string, bool> BuildLookup(string[] words)
    {
        if (words == null) return null;
        Dictionary<string, bool> lookup = new Dictionary<string, bool>();
        foreach (string w in words)
        {
            if (w != null) lookup[w.ToLowerInvariant()] = true;
        }
        return lookup;
    }
}"
        csharpProvider = dotnetobject "Microsoft.CSharp.CSharpCodeProvider"
        compilerParams = dotnetobject "System.CodeDom.Compiler.CompilerParameters"
        compilerParams.ReferencedAssemblies.AddRange #("System.dll");
        compilerParams.GenerateInMemory = on
        compilerResults = csharpProvider.CompileAssemblyFromSource compilerParams #(source)
        if (compilerResults.Errors.Count > 0 ) then
        (
            -- Collect every compiler diagnostic and print the text.
            local errs = stringstream ""
            for i = 0 to (compilerResults.Errors.Count-1) do
            (
                local err = compilerResults.Errors.Item[i]
                format "Error:% Line:% Column:% %\n" err.ErrorNumber err.Line err.Column err.ErrorText to:errs
            )
            format "%\n" (errs as string)
            undefined
        )
        else
        (
            compilerResults.CompiledAssembly
        )
    )
    -- Skip the run when compilation failed; the errors were already printed above.
    if cs_assembly != undefined do
    (
        gc()
        words = #("alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega")
        t0 = timestamp()
        h0 = heapfree
        tp = cs_assembly.CreateInstance "TextProcessor"
        tp.ProcessFile (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt") words
        t1 = timestamp()
        h1 = heapfree
        format "time:% heap:%\n" (t1 - t0) (h0 - h1)
        format "\tkeys:%\n" tp.keys
        format "\tnums:%\n" tp.vals
    )
)
time:42 on my machine
Thank you, Denis!
The fastest time I have is 58, the slowest is 80.
The code collects the “first” word of every line. I edited the 50000LinesOfText.txt file, adding 3 new lines at the start, and this is the result:
The words array does not include the “some” and “just”.
I am trying to update your code to work with string, not a file, using the Serejah code as an example.
https://dotnetfiddle.net/tid1Jd
Now I know how to use a file and a string.
Thank you.
The only thing left to solve is to make the code collect only the words that are passed in. Right now, if I pass only “kappa”, the code still collects all of them.
// Counts, per word, how many lines of str start with that word (case-insensitive).
// Only the words listed in words are counted; passing null counts every first word.
// Results are published in data and, as parallel arrays, in keys and vals.
public void ProcessString(string str , String[] words)
{
    data = new Dictionary<string, int>();

    // Lower-cased lookup of the requested words, so matching does not depend on the
    // casing of the caller's list and costs one hash probe per line instead of a scan.
    Dictionary<string, bool> wanted = null;
    if (words != null)
    {
        wanted = new Dictionary<string, bool>();
        foreach (string w in words)
        {
            if (w != null) wanted[w.ToLowerInvariant()] = true;
        }
    }

    using (System.IO.StringReader sr = new System.IO.StringReader(str))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // A null separator splits on any whitespace; RemoveEmptyEntries skips leading
            // indentation (spaces or tabs), and the limit of 2 avoids splitting the whole line.
            string[] parts = line.Split((char[])null, 2, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length == 0) continue; // blank line

            string word = parts[0].ToLowerInvariant();
            if (wanted != null && !wanted.ContainsKey(word)) continue;

            int n;
            data.TryGetValue(word, out n); // n stays 0 for a word not seen yet
            data[word] = n + 1;
        }
    }

    keys = new string[data.Count];
    data.Keys.CopyTo(keys, 0);
    vals = new int[data.Count];
    data.Values.CopyTo(vals, 0);
}
change to above, and call:
tp.ProcessString strToCheck #("kappa")
-- Compiles a small C# helper in memory and uses it to count how many lines of a text
-- (a file or an in-memory string) start with each of the given words, then prints
-- timing and results.
(
    -- Evaluates to the compiled assembly, or undefined when compilation fails.
    cs_assembly =
    (
        -- The C# source lives in a verbatim string, so it must not contain double quotes.
        local source = @"
using System;
using System.IO;
using System.Collections.Generic;

public class TextProcessor
{
    public Dictionary<string, int> data;
    public string[] keys;
    public int[] vals;

    // Counts, per word, how many lines of the file start with that word (case-insensitive).
    // Only the words listed in words are counted; passing null counts every first word.
    public void ProcessFile(string file, string[] words)
    {
        Count(File.ReadAllLines(file), words);
    }

    // Same as ProcessFile, but reads the lines from an in-memory string.
    public void ProcessString(string str, string[] words)
    {
        Count(SplitLines(str), words);
    }

    // Yields the lines of str one at a time without materialising them all.
    private static IEnumerable<string> SplitLines(string str)
    {
        using (StringReader sr = new StringReader(str))
        {
            string line;
            while ((line = sr.ReadLine()) != null) yield return line;
        }
    }

    // Tallies the first word of every line and publishes the result in data,
    // and as parallel arrays in keys and vals.
    private void Count(IEnumerable<string> lines, string[] words)
    {
        data = new Dictionary<string, int>();

        // Lower-cased lookup of the requested words; null means no filtering.
        Dictionary<string, bool> wanted = null;
        if (words != null)
        {
            wanted = new Dictionary<string, bool>();
            foreach (string w in words)
            {
                if (w != null) wanted[w.ToLowerInvariant()] = true;
            }
        }

        foreach (string line in lines)
        {
            // A null separator splits on any whitespace; RemoveEmptyEntries skips leading
            // indentation (spaces or tabs), and the limit of 2 avoids splitting the whole line.
            string[] parts = line.Split((char[])null, 2, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length == 0) continue; // blank line

            string word = parts[0].ToLowerInvariant();
            if (wanted != null && !wanted.ContainsKey(word)) continue;

            int n;
            data.TryGetValue(word, out n); // n stays 0 for a word not seen yet
            data[word] = n + 1;
        }

        keys = new string[data.Count];
        data.Keys.CopyTo(keys, 0);
        vals = new int[data.Count];
        data.Values.CopyTo(vals, 0);
    }
}"
        csharpProvider = dotnetobject "Microsoft.CSharp.CSharpCodeProvider"
        compilerParams = dotnetobject "System.CodeDom.Compiler.CompilerParameters"
        compilerParams.ReferencedAssemblies.AddRange #("System.dll");
        compilerParams.GenerateInMemory = on
        compilerResults = csharpProvider.CompileAssemblyFromSource compilerParams #(source)
        if (compilerResults.Errors.Count > 0 ) then
        (
            -- Collect every compiler diagnostic and print the text.
            local errs = stringstream ""
            for i = 0 to (compilerResults.Errors.Count-1) do
            (
                local err = compilerResults.Errors.Item[i]
                format "Error:% Line:% Column:% %\n" err.ErrorNumber err.Line err.Column err.ErrorText to:errs
            )
            format "%\n" (errs as string)
            undefined
        )
        else
        (
            compilerResults.CompiledAssembly
        )
    )
    -- Skip the run when compilation failed; the errors were already printed above.
    if cs_assembly != undefined do
    (
        gc()
        words = #("alpha", "beta", "gama", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega")
        t0 = timestamp()
        h0 = heapfree
        tp = cs_assembly.CreateInstance "TextProcessor"
        --tp.ProcessFile (DOWNLOAD_DIR_ + @"\50000LinesOfText.txt") words
        -- strToCheck is expected to be defined by the caller before this script runs.
        tp.ProcessString strToCheck words
        t1 = timestamp()
        h1 = heapfree
        format "time:% heap:%\n" (t1 - t0) (h0 - h1)
        format "\tkeys:%\n" tp.keys
        format "\tnums:%\n" tp.vals
    )
)
Thank you.
This `(Array.IndexOf(words, word) > -1)` is the C# equivalent of findItem.
Can the dictionary in C# be constructed with 3 elements?
1- word
2- line in which the word is found
3- text of that line.
Right now the code counts how many lines start with an arbitrary word, but it does not collect the line numbers or the line text.