Strpbrk() In Python
Solution 1:
I may have gone a little overboard here, but the tl;dr is that is the original code is actually faster than (EDIT: macOS's) strpbrk()
, but some strpbrk()
implementations may be faster!
str.count()
uses this bundle of strange and beautiful magic in its guts – no wonder it's fast.
The full code is available at https://github.com/akx/so55822235
Python code
These approaches are in pure Python, including OP's original
defgc_characters_original(haystack):
gc_characters = 0for c in ("c", "C", "g", "G"):
gc_characters += haystack.count(c)
return gc_characters
defgc_characters_counter(haystack):
counter = Counter(haystack)
returnsum(counter.get(c, 0) for c in ["c", "C", "g", "G"])
defgc_characters_manual(haystack):
gc_characters = 0for x in haystack:
if x in ("c", "C", "g", "G"):
gc_characters += 1return gc_characters
defgc_characters_iters(haystack):
gc_characters = haystack.count("c") + haystack.count("C") + haystack.count("g") + haystack.count("G")
return gc_characters
Cython extension wrapping strpbrk()
from libc.string cimport strpbrk
cdef int _count(char* s, char *key):
assert s isnot NULL, "byte string value is NULL"
cdef int n = 0
cdef char* pch = strpbrk (s, key)
while pch isnot NULL:
n += 1
pch = strpbrk (pch + 1, key)
return n
def count(s, key):
return _count(s, key)
...
defgc_characters_cython(haystack_bytes):
return charcount_cython.count(haystack_bytes, b"cCgG")
Handmade C extension wrapping strpbrk()
#define PY_SSIZE_T_CLEAN#include<Python.h>#include<string.h>staticunsignedintcount(constchar *str, constchar *key) {
unsignedint n = 0;
char *pch = strpbrk(str, key);
while (pch != NULL) {
n++;
pch = strpbrk(pch + 1, key);
}
return n;
}
static PyObject *charcount_count(PyObject *self, PyObject *args) {
constchar *str, *key;
Py_ssize_t strl, keyl;
if (!PyArg_ParseTuple(args, "s#s#", &str, &strl, &key, &keyl)) {
PyErr_SetString(PyExc_RuntimeError, "invalid arguments");
returnNULL;
}
int n = count(str, key);
return PyLong_FromLong(n);
}
static PyMethodDef CharCountMethods[] = {
{"count", charcount_count, METH_VARARGS,
"Count the total occurrences of any s2 characters in s1"},
{NULL, NULL, 0, NULL},
};
staticstructPyModuleDefspammodule = {PyModuleDef_HEAD_INIT, "charcount",
NULL, -1, CharCountMethods};
PyMODINIT_FUNC PyInit_charcount(void) { return PyModule_Create(&spammodule); }
...
defgc_characters_cext_b(haystack_bytes):
return charcount.count(haystack_bytes, b"cCgG")
defgc_characters_cext_u(haystack):
return charcount.count(haystack, "cCgG")
Measurements
On my Mac, counting cCgG
in an one-million-character string of random letters, i.e.
haystack = "".join(random.choice(string.ascii_letters) for x inrange(1_000_000))
haystack_bytes = haystack.encode()
print("original", timeit.timeit(lambda: gc_characters_original(haystack), number=100))
print("unrolled", timeit.timeit(lambda: gc_characters_iters(haystack), number=100))
print("cython", timeit.timeit(lambda: gc_characters_cython(haystack_bytes), number=100))
print("c extension, bytes", timeit.timeit(lambda: gc_characters_cext_b(haystack_bytes), number=100))
print("c extension, unicode", timeit.timeit(lambda: gc_characters_cext_u(haystack), number=100))
print("manual loop", timeit.timeit(lambda: gc_characters_manual(haystack), number=100))
print("counter", timeit.timeit(lambda: gc_characters_counter(haystack), number=100))
yields these results:
original 0.34033612700000004
unrolled 0.33661798900000006
cython 0.6542106270000001c extension, bytes 0.46668797900000003c extension, unicode 0.4761082090000004
manual loop 11.625232557
counter 7.0389275090000005
So, unless the strpbrk()
implementation in my mac's libc
is horribly underpowered (EDIT: which it is), it's just best to use .count()
.
EDIT
I added glibc's strcspn()
/strpbrk()
and it's startlingly faster than the näive version of strpbrk()
shipped with macOS:
original 0.329256
unrolled 0.333872
cython 0.433299c extension, bytes 0.432552c extension, unicode 0.437332c extension glibc, bytes 0.169704<-- new
c extension glibc, unicode 0.158153<-- new
glibc
also has SSE2 and SSE4 versions of the functions, which would probably be even faster still.
EDIT 2
I went back to this one more time since I had an epiphany about how glibc's strcspn()
's clever lookup table could be used for character counting:
size_tfastcharcount(constchar *str, constchar *haystack) {
size_t count = 0;
// Prepare lookup table.// It will contain 1 for all characters in the haystack.unsignedchar table[256] = {0};
unsignedchar *ts = (unsignedchar *)haystack;
while(*ts) table[*ts++] = 1;
unsignedchar *s = (unsignedchar *)str;
#define CHECK_CHAR(i) { if(!s[i]) break; count += table[s[i]]; }for(;;) {
CHECK_CHAR(0);
CHECK_CHAR(1);
CHECK_CHAR(2);
CHECK_CHAR(3);
s += 4;
}
#undef CHECK_CHARreturn count;
}
The results are very impressive, outperforming the glibc implementation 4-fold and the original Python implementation 8.5-fold.
original|6.463880sec/2000 iter|309iter/sunrolled|6.378582sec/2000 iter|313iter/scythonlibc|8.443358sec/2000 iter|236iter/scythonglibc|2.936697sec/2000 iter|681iter/scythonfast|0.766082sec/2000 iter|2610 iter/scextension,bytes|8.373438sec/2000 iter|238iter/scextension,unicode|8.394805sec/2000 iter|238iter/scextensionglib,bytes|2.988184sec/2000 iter|669iter/scextensionglib,unicode|2.992429sec/2000 iter|668iter/scextensionfast,bytes|0.754072sec/2000 iter|2652 iter/scextensionfast,unicode|0.762074sec/2000 iter|2624 iter/s
Solution 2:
.count
will iterate over haystack
every time you call it - but is heavily optimized over the alternatives I suggest here. It depends on how many characters are in your real case. You can try
from collections import Counter
cnt = Counter(haystack)
gc_characters = sum(cnt.get(e, 0) for e in ['c', 'C', 'g', 'G']])
as this will iterate over the string once and store the counts of every occurring character. It may be marginally faster to only look for the characters you care about and using a set for those such characters for faster __contains__
.
gc_chars = {'c', 'C', 'g', 'G'}
counts = {e: 0forein gc_chars}
forcin gc_chars:
if c in gc_chars:
counts[c] += 1
gc_characters = sum(counts.values())
If you provide more details of the composition of hastack
and how often this is called, we could try to help you more.
Caching
Another idea is that if hastack
is frequently the same, you could perhaps keep an in-memory cache of the answer
from functools import lru_cache
@lru_cachedefhaystack_metric(hastack):
returnsum(haystack.count(c) for c in ['c', 'C', 'g', 'G']))
(with whatever implementation you settle on). You could also explore ctypes - but I have little experience with it.
Post a Comment for "Strpbrk() In Python"