LinuxQuestions.org
Old 12-26-2011, 10:53 PM   #1
trist007
Senior Member
 
Registered: May 2008
Distribution: Slackware
Posts: 1,052

Rep: Reputation: 70
A question about threads and mutex locks in C...


Here's my current program that uses threads. It takes 3 arguments: a starting long long number, the number of primes to find, and the number of threads to use. For example, if the arguments are 9 2 1, the output will be
Code:
9
11
I notice that if I use more than 3 threads, I start getting weird output. Are my mutex locks in the right place? What would be a good strategy for placing mutex locks? Any suggestions on where to move them?

Code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

// global functions
void *crunch();
void *common_proc();
long long getNumber();
int isPrimeLongLong(long long n);

// global variables
long long n, *archive;
int m, l, primes_found = 0;
pthread_t *tid;
pthread_t common;
pthread_mutex_t lock1 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t lock2 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t lock3 = PTHREAD_MUTEX_INITIALIZER;

int main(int argc, char *argv[])
{
   // local variables
   long i = 0;

   if(argc != 4)
   {
      fprintf(stderr, "Usage: %s <long long number> <# of primes> <# of threads>\n", argv[0]);
      return 1;
   }

   // print introduction and arguments
   fprintf(stderr, "Primecruncher by\ntrist007\n");

   // long long starting number
   n = atoll(argv[1]);

   // number of primes to find
   m = atoi(argv[2]);

   // global archive of primes found
   archive = malloc(m * sizeof(long long));

   // number of threads to start
   l = atoi(argv[3]);

   // create calculation threads
   tid = malloc(l * sizeof(pthread_t));

   for(i = 0; i < l; i++)
   {
      if(pthread_create(&tid[i], NULL, crunch, NULL))
         return 1;
   }

   // create common thread
   if(pthread_create(&common, NULL, common_proc, NULL))
      return 1;

   // wait for common thread to exit
   pthread_join(common, NULL);

   // view the archive
   for(i = 0; i < m; i++)
   {
      printf("%llu\n", archive[i]);
   }

   // free memory
   free(archive);
   free(tid);

   // close program
   return 0;
}

void *crunch()
{
   long long number;
   pthread_mutex_lock(&lock1);
   while(number = getNumber())
   {
      pthread_mutex_unlock(&lock1);
      if(isPrimeLongLong(number))
      {
         pthread_mutex_lock(&lock2);
         archive[primes_found] = number;
         primes_found++;
         pthread_mutex_unlock(&lock2);
      }
   }
}

void *common_proc()
{
   int i;

   // wait for crunch threads to exit
   for(i = 0; i < l; i++)
   {
      pthread_join(tid[i], NULL);
   }
}

long long getNumber()
{
   pthread_mutex_lock(&lock3);
   if(primes_found < m)
   {
      pthread_mutex_unlock(&lock3);
      return n++;
   }
   else
      pthread_mutex_unlock(&lock3);
      return 0;
}
isPrimeLongLong() is in another file. It returns 1 if the long long number is prime and 0 if it is not.

Last edited by trist007; 12-26-2011 at 11:05 PM.
 
Old 12-26-2011, 11:19 PM   #2
trist007
Senior Member
 
Registered: May 2008
Distribution: Slackware
Posts: 1,052

Original Poster
Rep: Reputation: 70
Here is the code for isPrimeLongLong() in case some of you guys want to run it.
Code:
#include <math.h>

int isPrimeInt(int n)  {
   int k;
   for(k = 2; k < sqrt(n); k++)  {
      if ((n % k) == 0)
         return 1;
   }
   return 0;
}

int isPrimeLongLong(long long n)  {
    long long k;
   for(k = 2; k < sqrt(n); k++)  {
      if ((n % k) == 0)
         return 0;
   }
   return 1;
}
Compile with "-lm -lpthread"
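For example, assuming the main program is in prime.c and this file is saved as primelib.c (adjust the file names to whatever you actually use), the build line would look something like:
Code:
gcc -o prime prime.c primelib.c -lm -lpthread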
 
Old 12-27-2011, 04:33 AM   #3
trist007
Senior Member
 
Registered: May 2008
Distribution: Slackware
Posts: 1,052

Original Poster
Rep: Reputation: 70
There, I think I got it. Do you guys see any issues with concurrency?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

// global functions
void *crunch();
void *common_proc();
long long getNumber();
int isPrimeLongLong(long long n);
void bubbleSort(long long *);

// global variables
long long n, *archive;
int m, l, primes_found = 0;
pthread_t *tid;
pthread_t common;
pthread_mutex_t lock1 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t lock2 = PTHREAD_MUTEX_INITIALIZER;

int main(int argc, char *argv[])
{
   // local variables
   long i = 0;

   if(argc != 4)
   {
      fprintf(stderr, "Usage: %s <long long number> <# of primes> <# of threads>\n", argv[0]);
      return 1;
   }

   // print introduction and arguments
   fprintf(stderr, "Primecruncher by\nTristan Gonzalez\n");

   // long long starting number
   n = atoll(argv[1]);

   // number of primes to find
   m = atoi(argv[2]);

   // global archive of primes found
   archive = malloc(m * sizeof(long long));

   // number of threads to start
   l = atoi(argv[3]);

   // create calculation threads
   tid = malloc(l * sizeof(pthread_t));

   for(i = 0; i < l; i++)
   {
      if(pthread_create(&tid[i], NULL, crunch, NULL))
         return 1;
   }

   // create common thread
   if(pthread_create(&common, NULL, common_proc, NULL))
      return 1;

   // wait for common thread to exit
   pthread_join(common, NULL);

   // sort the archive
   bubbleSort(archive);

   // view the archive
   for(i = 0; i < m; i++)
   {
      printf("%llu\n", archive[i]);
   }

   // free memory
   free(archive);
   free(tid);

   // close program
   return 0;
}

void *crunch()
{
   long long number;
   while(1)
   {
      pthread_mutex_lock(&lock1);
      number = getNumber();
      pthread_mutex_unlock(&lock1);
      if(number != 0)
      {
         if(isPrimeLongLong(number))
         {
            pthread_mutex_lock(&lock2);
            archive[primes_found] = number;
            primes_found++;
            pthread_mutex_unlock(&lock2);
         }
      }
      else
      {
         pthread_exit(NULL);
      }
   }
}

void *common_proc()
{
   int i;

   // wait for crunch threads to exit
   for(i = 0; i < l; i++)
   {
      pthread_join(tid[i], NULL);
   }
}

long long getNumber()
{
   if(primes_found < m)
   {
      return n++;
   }
   else
      return 0;
}

void bubbleSort(long long *archive)  {
   long long tmp;
   int a, b;
   for(a = 1; a < m; ++a)
      for(b = m-1; b >= a; --b) {
         /* compare adjacent elements */
         if(archive[ b - 1] > archive[ b ]) {
            /* exchange elements */
            tmp = archive[ b - 1];
            archive[ b - 1] = archive[ b ];
            archive[ b ] = tmp;
         }
      }
}

Last edited by trist007; 12-27-2011 at 05:02 AM.
 
Old 12-27-2011, 07:54 AM   #4
skoona
Member
 
Registered: Mar 2004
Location: Indiana, USA
Distribution: Fedora, CentOS, Ubuntu, OS/X, Raspbian
Posts: 90

Rep: Reputation: 18
Your second program looks fine, with one exception: your variable names (n, m, l) are too short and not descriptive, making your code hard to read. Consider using full words, like next_number, number_of_primes, number_of_threads.

I will add that you seem to have figured out how to use mutexes correctly. Your first program was using them wrong and calling unlock too often.
 
Old 12-27-2011, 08:34 AM   #5
trist007
Senior Member
 
Registered: May 2008
Distribution: Slackware
Posts: 1,052

Original Poster
Rep: Reputation: 70
Skoona, thanks for taking a look. Yeah, that's a good idea about variable names. I'll do that. I ran my program a number of times last night and noticed two things.

1. With the input 9 10 8, the output is
Code:
Primecruncher by
trist007
9
11
13
17
19
23
25
29
31
37
but every once in a while I get
Code:
Primecruncher by
trist007
11
13
17
19
23
25
29
31
37
41
Something gets overwritten, so I'm thinking I still have a concurrency issue, but I don't know where.

2. It segfaults every once in a while. It displays the archive and then segfaults. The error suggests that it's segfaulting during my two free statements. Do you see any reason why it would segfault on those two statements?
Code:
Primecruncher by
trist007
9
11
13
17
19
23
25
29
31
37
*** glibc detected *** ./prime: free(): invalid next size (fast): 0x0000000000602070 ***
======= Backtrace: =========
/lib64/libc.so.6(+0x78f85)[0x7f161c67af85]
/lib64/libc.so.6(cfree+0x73)[0x7f161c67ed93]
./prime[0x400b77]
/lib64/libc.so.6(__libc_start_main+0xfd)[0x7f161c620e5d]
./prime[0x4008c9]
======= Memory map: ========
00400000-00402000 r-xp 00000000 08:11 10489159                           /home/rgonzale/dev/current/prime
00601000-00602000 rw-p 00001000 08:11 10489159                           /home/rgonzale/dev/current/prime
00602000-00623000 rw-p 00000000 00:00 0                                  [heap]
7f1610000000-7f1610021000 rw-p 00000000 00:00 0 
7f1610021000-7f1614000000 ---p 00000000 00:00 0 
7f1617be4000-7f1617bf9000 r-xp 00000000 08:04 3276913                    /usr/lib64/libgcc_s.so.1
7f1617bf9000-7f1617df8000 ---p 00015000 08:04 3276913                    /usr/lib64/libgcc_s.so.1
7f1617df8000-7f1617df9000 rw-p 00014000 08:04 3276913                    /usr/lib64/libgcc_s.so.1
7f1617df9000-7f1617dfa000 ---p 00000000 00:00 0 
7f1617dfa000-7f16185fa000 rw-p 00000000 00:00 0 
7f16185fa000-7f16185fb000 ---p 00000000 00:00 0 
7f16185fb000-7f1618dfb000 rw-p 00000000 00:00 0 
7f1618dfb000-7f1618dfc000 ---p 00000000 00:00 0 
7f1618dfc000-7f16195fc000 rw-p 00000000 00:00 0 
7f16195fc000-7f16195fd000 ---p 00000000 00:00 0 
7f16195fd000-7f1619dfd000 rw-p 00000000 00:00 0 
7f161c602000-7f161c79d000 r-xp 00000000 08:04 3014729                    /lib64/libc-2.13.so
7f161c79d000-7f161c99d000 ---p 0019b000 08:04 3014729                    /lib64/libc-2.13.so
7f161c99d000-7f161c9a1000 r--p 0019b000 08:04 3014729                    /lib64/libc-2.13.so
7f161c9a1000-7f161c9a2000 rw-p 0019f000 08:04 3014729                    /lib64/libc-2.13.so
7f161c9a2000-7f161c9a8000 rw-p 00000000 00:00 0 
7f161c9a8000-7f161c9c0000 r-xp 00000000 08:04 3014743                    /lib64/libpthread-2.13.so
7f161c9c0000-7f161cbc0000 ---p 00018000 08:04 3014743                    /lib64/libpthread-2.13.so
7f161cbc0000-7f161cbc1000 r--p 00018000 08:04 3014743                    /lib64/libpthread-2.13.so
7f161cbc1000-7f161cbc2000 rw-p 00019000 08:04 3014743                    /lib64/libpthread-2.13.so
7f161cbc2000-7f161cbc6000 rw-p 00000000 00:00 0 
7f161cbc6000-7f161cc4a000 r-xp 00000000 08:04 3014733                    /lib64/libm-2.13.so
7f161cc4a000-7f161ce49000 ---p 00084000 08:04 3014733                    /lib64/libm-2.13.so
7f161ce49000-7f161ce4a000 r--p 00083000 08:04 3014733                    /lib64/libm-2.13.so
7f161ce4a000-7f161ce4b000 rw-p 00084000 08:04 3014733                    /lib64/libm-2.13.so
7f161ce4b000-7f161ce6c000 r-xp 00000000 08:04 3014771                    /lib64/ld-2.13.so
7f161d03e000-7f161d041000 rw-p 00000000 00:00 0 
7f161d069000-7f161d06b000 rw-p 00000000 00:00 0 
7f161d06b000-7f161d06c000 r--p 00020000 08:04 3014771                    /lib64/ld-2.13.so
7f161d06c000-7f161d06e000 rw-p 00021000 08:04 3014771                    /lib64/ld-2.13.so
7fffee198000-7fffee1b9000 rw-p 00000000 00:00 0                          [stack]
7fffee1ff000-7fffee200000 r-xp 00000000 00:00 0                          [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]
Aborted (core dumped)
As for number 1, I was going to malloc memory for each thread to hold the variable number, so that I wouldn't have to protect it. Actually, since the long long number variable is declared inside the thread function, each thread will have its own copy of that variable on its own stack, so I shouldn't have to protect it, right?
As for number 2, do I need to typecast the arguments to the free() calls?

Last edited by trist007; 12-27-2011 at 09:25 AM.
 
Old 12-27-2011, 06:32 PM   #6
ntubski
Senior Member
 
Registered: Nov 2005
Distribution: Debian, Arch
Posts: 3,781

Rep: Reputation: 2081
Code:
void *crunch()
{
...
      pthread_mutex_lock(&lock1);
      number = getNumber(); // reads primes_found
      pthread_mutex_unlock(&lock1);
...
            pthread_mutex_lock(&lock2);
            archive[primes_found] = number; // ERROR: primes_found may be greater than m here!
            primes_found++; // writes primes_found
            pthread_mutex_unlock(&lock2);
...
}

long long getNumber()
{
   if(primes_found < m)
   {
      return n++;
   }
   else
      return 0;
}
Quote:
Originally Posted by trist007
There I think I got it. Do you guys see any issues with concurrency?
You are using different locks for reading and writing primes_found, which means it's effectively unprotected access.
Quote:
2. It segfaults every once in a while.
You are writing past the end of archive.
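One possible fix, sketched below with the globals from the second program and a made-up helper name store_prime(): do the bound check and the store under one mutex, and take that same mutex in getNumber() when it reads primes_found, so every access to the shared counter goes through a single lock.
Code:
/* Sketch only: a single mutex guards every read and write of primes_found,
   so the bound is re-checked at the moment the slot is claimed and the
   write can never run past the end of archive. */
void store_prime(long long number)
{
   pthread_mutex_lock(&lock1);
   if(primes_found < m)          /* re-check the bound under the lock */
   {
      archive[primes_found] = number;
      primes_found++;
   }
   pthread_mutex_unlock(&lock1);
}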
 
Old 12-27-2011, 08:06 PM   #7
tuxdev
Senior Member
 
Registered: Jul 2005
Distribution: Slackware
Posts: 2,012

Rep: Reputation: 115
I've got a few suggestions to reduce headaches.

1. Use separate "archive" arrays for each thread. This helps reduce memory sharing between threads, which eliminates the need for a lot of synchronization. It would also likely help performance, because the more you need to synchronize, the less benefit you get from multithreading. To do this, I recommend using the void* argument in pthread_create that gets passed into the thread callback; this sort of thing is exactly why it exists (see the sketch after this list).
2. Put the lock on getNumber inside the function instead. It makes it more obvious that it's a critical section and protected properly, and eliminates any possibility of forgetting to lock.
3. Eliminate the common thread. There's straight up no reason for it to exist when you can do the joins from main().
4. Eliminate your usage of global variables. You might not be able to get rid of all of them, but the more you do, the less you have to think about unintended data sharing.
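As an illustration of suggestion 1, here is a minimal sketch of passing a per-thread structure through pthread_create's void* argument. It links against the poster's primelib.c for isPrimeLongLong(); the struct, the thread count, the starting ranges, and how the per-thread results get merged afterwards are all made up for the example.
Code:
#include <stdio.h>
#include <pthread.h>

int isPrimeLongLong(long long n);   /* from primelib.c */

/* One argument block per thread: each thread reads only its own block
   and writes only its own result array, so no locking is needed. */
struct work {
   long long start;      /* first candidate this thread tests       */
   int       want;       /* how many primes this thread should find */
   long long *found;     /* private result array, length 'want'     */
};

static void *crunch(void *arg)
{
   struct work *w = arg;
   long long candidate = w->start;
   int got = 0;

   while(got < w->want)
   {
      if(isPrimeLongLong(candidate))
         w->found[got++] = candidate;
      candidate++;
   }
   return NULL;
}

int main(void)
{
   enum { NTHREADS = 4, PER_THREAD = 5 };
   pthread_t tid[NTHREADS];
   struct work w[NTHREADS];
   long long results[NTHREADS][PER_THREAD];
   int i, j;

   for(i = 0; i < NTHREADS; i++)
   {
      w[i].start = 1000 + 1000 * i;   /* arbitrary example ranges */
      w[i].want  = PER_THREAD;
      w[i].found = results[i];
      if(pthread_create(&tid[i], NULL, crunch, &w[i]))
         return 1;
   }
   for(i = 0; i < NTHREADS; i++)
      pthread_join(tid[i], NULL);

   /* main() now owns every result and can merge, sort, or print them */
   for(i = 0; i < NTHREADS; i++)
      for(j = 0; j < PER_THREAD; j++)
         printf("thread %d: %lld\n", i, results[i][j]);
   return 0;
}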
 
Old 12-28-2011, 01:32 PM   #8
trist007
Senior Member
 
Registered: May 2008
Distribution: Slackware
Posts: 1,052

Original Poster
Rep: Reputation: 70
Great ideas, guys, thank you. I'm learning so much. I will use the same mutex lock when reading from or writing to a shared variable.

Three questions.

1. I'm going to use separate archive arrays for each thread. How would you approach the situation where the arguments are to find 10 primes using 8 threads? Should each thread find 1 prime and the first/last threads find 1 more? Is there a smarter way?

2. Just to clarify, if a variable is declared inside the thread function, then it will only be on that thread's stack and does not need to be protected, even if it has the same variable name as the variable in the other threads, correct?

3. How would I use the mutex locks inside the getNumber() function?
Code:
long long getNumber()
{
   if(primes_found < primes_to_find)
   {
      return n++;
   }
   else
      return 0;
}
Code:
long long getNumber()
{
   pthread_mutex_lock(&lock1);
   if(primes_found < primes_to_find)
   {
      pthread_mutex_unlock(&lock1);
      return n++;
   }
   else
      pthread_mutex_unlock(&lock1);
      return 0;
}
I can't place the mutex unlocks after the returns because they would not get executed. Would this work? Or would I need to redesign the function so that the locks don't go in between if conditions?
 
Old 12-28-2011, 03:30 PM   #9
sundialsvcs
LQ Guru
 
Registered: Feb 2004
Location: SE Tennessee, USA
Distribution: Gentoo, LFS
Posts: 10,659
Blog Entries: 4

Rep: Reputation: 3941
Interesting though this thread might be ... you are merely wasting the computer's time.

The calculation of prime numbers is a strictly compute-bound process. No input/output is involved. The speed at which you are able to calculate new primes is determined by "the raw speed of the CPU, less overhead." Given that you cannot make the CPU run faster, you need to minimize the overhead ... by totally eliminating the use of threads.

Always remember this: threading divides the CPU resource; it does not multiply it.

When you do get to an actual "valid" use of mutexes and threads, observe the following principles:
  1. The name of each mutex should be very descriptive.
  2. Put the mutex calls into a function that requires them; don't write functions that will only do the right thing when they are called under the right conditions.

Last edited by sundialsvcs; 12-28-2011 at 03:33 PM.
 
Old 12-28-2011, 05:55 PM   #10
skoona
Member
 
Registered: Mar 2004
Location: Indiana, USA
Distribution: Fedora, CentOS, Ubuntu, OS/X, Raspbian
Posts: 90

Rep: Reputation: 18
trist007,

(1) I would not use separate archive arrays for this. It would make the simple example too complex. If you're wanting to up the ante here, consider creating a thread pool fed by an asynchronous queue, and have each thread write the result back via an answer queue (see the sketch at the end of this post). Of course, each request/response should be an allocated memory structure. Also, consider increasing the size of the work request; maybe have each thread compute up to 10 primes per request.

(2) You are correct; local vars don't need protection.

(3) A lock is no good if you don't honor it. Never act on a protected value outside of the lock.

Code:
long long getNumber()
{
   pthread_mutex_lock(&lock1);
   if(primes_found < primes_to_find)
   {
      pthread_mutex_unlock(&lock1);
      return n++;
   }
   else
      pthread_mutex_unlock(&lock1);
      return 0;
}
do this instead:

Code:
long long getNumber()
{
   pthread_mutex_lock(&lock1);
   if(primes_found < primes_to_find)
   {
      long long value = n++;
      pthread_mutex_unlock(&lock1);
      return value;
   }
   else
      pthread_mutex_unlock(&lock1);
      return 0;
}
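Going back to point (1): below is a minimal sketch of the request/answer queue idea, using a plain mutex and two condition variables (nothing lockless). The type, the names, and the fixed capacity are all invented for the example.
Code:
#include <pthread.h>

#define QCAP 16

struct queue {
   void *item[QCAP];
   int head, count;
   pthread_mutex_t mtx;
   pthread_cond_t not_empty, not_full;
};

void queue_init(struct queue *q)
{
   q->head = q->count = 0;
   pthread_mutex_init(&q->mtx, NULL);
   pthread_cond_init(&q->not_empty, NULL);
   pthread_cond_init(&q->not_full, NULL);
}

void queue_push(struct queue *q, void *p)   /* blocks while the queue is full */
{
   pthread_mutex_lock(&q->mtx);
   while(q->count == QCAP)
      pthread_cond_wait(&q->not_full, &q->mtx);
   q->item[(q->head + q->count) % QCAP] = p;
   q->count++;
   pthread_cond_signal(&q->not_empty);
   pthread_mutex_unlock(&q->mtx);
}

void *queue_pop(struct queue *q)            /* blocks while the queue is empty */
{
   void *p;
   pthread_mutex_lock(&q->mtx);
   while(q->count == 0)
      pthread_cond_wait(&q->not_empty, &q->mtx);
   p = q->item[q->head];
   q->head = (q->head + 1) % QCAP;
   q->count--;
   pthread_cond_signal(&q->not_full);
   pthread_mutex_unlock(&q->mtx);
   return p;
}
Worker threads would queue_pop() from a request queue (each request being an allocated structure saying, say, "test these 10 candidates"), do the work, and queue_push() the filled-in structure onto an answer queue that the main thread drains.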

Last edited by skoona; 12-28-2011 at 06:06 PM.
 
Old 12-29-2011, 11:41 AM   #11
tuxdev
Senior Member
 
Registered: Jul 2005
Distribution: Slackware
Posts: 2,012

Rep: Reputation: 115
Quote:
The calculation of prime numbers is a strictly compute-bound process. No input/output is involved. The speed at which you are able to calculate new primes is determined by "the raw speed of the CPU, less overhead." Given that you cannot make the CPU run faster, you need to minimize the overhead ... by totally eliminating the use of threads.
I'm assuming this is a homework/simple exercise to get what multithreading is all about. Also, minimizing overhead is nice and all, but I've got all these idle cores just doing nothing without multithreading...

Quote:
(1) I would not use separate archive arrays for this. It would make the simple example too complex. If you're wanting to up the ante here, consider creating a thread pool fed by an asynchronous queue, and have each thread write the result back via an answer queue. Of course, each request/response should be an allocated memory structure. Also, consider increasing the size of the work request; maybe have each thread compute up to 10 primes per request
Uh, the entire reason I suggested multiple archive arrays is that it's simpler and requires way less locking. In multithreading, the biggest "evil"/source of errors/source of headaches is shared memory. Yeah, a master/slave system would be snazzy, but that's conceptually complicated and not quite as easy to get right as simply not sharing memory. Also, the "right" way of doing an async queue would involve lockless structures, which is way out of pretty much all of our leagues.

This is how I'd lock getNumber()
Code:
long long getNumber()
{
   pthread_mutex_lock(&lock1);
   long long ret = 0;
   if(primes_found < primes_to_find)
   {
      ret = n++;
   }
   pthread_mutex_unlock(&lock1);
   return ret;
}
The else statement here looks like it's supposed to do both actions, but in reality the else only includes the unlock and the return is actually outside the whole if-else. It's a good habit to always put {} around blocks so that you don't accidentally create a bug like this.
Code:
   else
      pthread_mutex_unlock(&lock1);
      return 0;

Last edited by tuxdev; 12-29-2011 at 11:44 AM.
 
Old 12-29-2011, 04:11 PM   #12
Nominal Animal
Senior Member
 
Registered: Dec 2010
Location: Finland
Distribution: Xubuntu, CentOS, LFS
Posts: 1,723
Blog Entries: 3

Rep: Reputation: 948
Lockless structures are not that hard if you use the atomic built-ins provided by the compiler; all recent C compilers I've used provide them. GCC does/will also support C++11-type atomic operations, but for now, I'd still use the legacy ones.

Think about what you want your program to do. The way I read it, you want the program to use N threads (N specified by the user) to find prime numbers, starting from 1, until K prime numbers have been found (and K is also specified by the user).

I would personally use a thread pool, very much like skoona described.

You would need one atomic (unsigned) integer variable, integer_to_be_tested, which initially is of course one. It describes the next value that must be tested to find out whether it is prime or not. Each thread uses __sync_fetch_and_add(&integer_to_be_tested,1) to read it. (The function returns the current value, but the variable is also incremented by one. The fact that this is atomic means that there is no way any other thread or CPU core will ever access integer_to_be_tested between the read and the increment; it just cannot happen. The end result is that no matter how the threads are ordered, the first thread will get 1, the second thread 2, the third thread 3, and so on, absolutely reliably.)

You would also have one other atomic (unsigned) integer variable, primes_found, and an array of (unsigned) integers (capable of holding K+N primes -- I'll explain later why), say prime[]. Note that the integers in this array do not need to be atomic.

Each thread would first check if primes_found>=K. If so, then all work is done, and the worker thread can exit. Otherwise, the thread would use __sync_fetch_and_add(&integer_to_be_tested,1) to find out which integer it should check for primality. If it is not a prime, then the thread would restart (at checking primes_found). If it is a prime, then the thread would use __sync_fetch_and_add(&primes_found,1) to find the index in prime[] at which to save the prime number. Because no two threads will use the same index, the array entries do not need to be atomic. After saving the prime, the thread will restart.

A sharp reader will notice that at this point, there is no guarantee that the primes in the prime[] array are in ascending order. It is not immediately obvious, but we do have the K first primes. You see, the scheme actually yields up to K+N primes. The desired K first primes will be in the array; you just have to sort it first to find which ones they are.

Also, note that integer_to_be_tested may be increased by up to N-1 from the highest actually tested (unsigned) integer. You can compensate by always decreasing it by N-1 (but not below the largest prime found). This is less than optimal, but it is probably a very good strategy. (It is too late in the evening for me to decide whether it is safe to just decrease it in the thread atomically, after it notes the limit has been reached already.)
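A minimal sketch of the scheme just described, using GCC's legacy __sync built-ins. K, N, is_prime() and the other names are stand-ins invented for the example, not code from this thread; the array is sized K+N for the reason explained above.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

#define K 20   /* primes wanted  */
#define N 4    /* worker threads */

static unsigned long long integer_to_be_tested = 1;  /* shared; atomic access only  */
static unsigned int       primes_found = 0;          /* shared; atomic access only  */
static unsigned long long prime[K + N];              /* each slot claimed atomically */

static int is_prime(unsigned long long n)            /* stand-in primality test */
{
   unsigned long long k;
   if(n < 2) return 0;
   for(k = 2; k * k <= n; k++)
      if(n % k == 0) return 0;
   return 1;
}

static void *worker(void *arg)
{
   unsigned long long candidate;
   unsigned int idx;
   (void)arg;

   for(;;)
   {
      if(__sync_fetch_and_add(&primes_found, 0) >= K)  /* atomic read */
         return NULL;                                  /* enough primes found */
      candidate = __sync_fetch_and_add(&integer_to_be_tested, 1);
      if(is_prime(candidate))
      {
         idx = __sync_fetch_and_add(&primes_found, 1); /* claim a unique slot */
         prime[idx] = candidate;
      }
   }
}

static int cmp(const void *a, const void *b)
{
   unsigned long long x = *(const unsigned long long *)a;
   unsigned long long y = *(const unsigned long long *)b;
   return (x > y) - (x < y);
}

int main(void)
{
   pthread_t tid[N];
   unsigned int total, i;
   int t;

   for(t = 0; t < N; t++)
      if(pthread_create(&tid[t], NULL, worker, NULL))
         return 1;
   for(t = 0; t < N; t++)
      pthread_join(tid[t], NULL);

   /* primes_found may have overshot K; sort what we have and keep the K smallest */
   total = primes_found;
   if(total > K + N) total = K + N;   /* defensive clamp */
   qsort(prime, total, sizeof prime[0], cmp);
   for(i = 0; i < K; i++)
      printf("%llu\n", prime[i]);
   return 0;
}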

The above is only sensible for the brute force approach. I would not use it.

In a real life situation, you would obviously want to apply some prime number sieve. Here, tuxdev's point is extremely important. Most implementations fail to account for CPU cache effects, especially something called cache-line ping-pong: each "cache line" (some small number of bytes, typically 16, 32, 64, or 128, depending on the processor) is owned by one CPU core. When multiple threads (running on different cores) access the same cache line, the CPU must move the data from one core to another. This introduces delays. If it happens only rarely, it is no problem, but suppose two cores access the same part in a tight loop -- and they are likely to, if you are using a sieve! -- then the CPU may end up transferring the ownership of the cache line after almost every single access, making it terribly slow.

The real trick is then how to make the sieve data structure synchronous (so that all threads see the same sieve), but separate for each thread/CPU core (so that there is no cache-line ping-pong).

It turns out that for something like the Sieve of Eratosthenes the separation is trivial:

Create a bit array, one bit per nonnegative integer up to some limit K (bits 0..K-1). Initialize it to all ones (one meaning prime, zero composite). Do it for each thread separately.

This time you'll only need one shared atomic unsigned integer, i. Since 1 is considered prime, initially set i=2.

Each thread will again use __sync_fetch_and_add(&i,1) to get its value of i, then clear (set to zero) bits 2i, 3i, 4i, ..., ni < K, in its own array only. Since the arrays are private to each thread, there is no need to use atomic operations, and there is no risk of cacheline ping-pong. Obviously when i>=K, all the work is done, and the worker threads should exit.

After the worker threads have exited you'll have N separate sieves. Fortunately, all you need to do is AND the sieves together -- in the main (non-worker) thread -- to get the actual, real sieve. If the nth bit is set, then n is prime.
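A minimal sketch of that scheme, using one byte per number instead of a packed bit array to keep the code short; K, N and all names are invented for the example. Each thread's array is allocated separately so different threads never write into the same cache lines.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

#define K 100   /* classify the integers 0..K-1 */
#define N 4     /* worker threads               */

static unsigned long long next_i = 2;   /* shared; accessed only atomically       */
static unsigned char *sieve[N];         /* sieve[t] is private to worker thread t */

static void *worker(void *arg)
{
   unsigned char *mine = arg;           /* this thread's private sieve */
   unsigned long long i, m;

   for(;;)
   {
      i = __sync_fetch_and_add(&next_i, 1);
      if(i >= K)
         return NULL;                   /* every i has been handed out */
      for(m = 2 * i; m < K; m += i)
         mine[m] = 0;                   /* cross off multiples of i    */
   }
}

int main(void)
{
   pthread_t tid[N];
   int t, n, prime;

   for(t = 0; t < N; t++)
   {
      sieve[t] = malloc(K);
      memset(sieve[t], 1, K);           /* 1 = "still assumed prime" */
      if(pthread_create(&tid[t], NULL, worker, sieve[t]))
         return 1;
   }
   for(t = 0; t < N; t++)
      pthread_join(tid[t], NULL);

   /* AND the private sieves: n is prime iff no thread crossed it off */
   for(n = 2; n < K; n++)
   {
      prime = 1;
      for(t = 0; t < N; t++)
         prime &= sieve[t][n];
      if(prime)
         printf("%d\n", n);
   }
   for(t = 0; t < N; t++)
      free(sieve[t]);
   return 0;
}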

Of course, this sieve approach does not match your original directives. This one answers the question "Which positive integers between 2 and K-1 are primes?".

Also, this works best if the sieve is sized just right to fit in the per-core cache. (A clever coder might use a loop and the TSC counter to see at which point accessing an array gets suddenly much slower, to deduce it. I'd probably just let the user specify it instead, and only add the clever bit if my users demanded it.)

Also, you don't need to start the bit array at zero; if F is the first number in the sieve you're interested in, then you can start clearing the bits at i×ceil(F/i) (but i must still start at 2, I believe).

Finally, if you wanted a program to find ever-increasing primes using the "chunked" sieve technique I described above, you could reserve one of the workers to do the sieve composition (AND) and output the primes, while the other threads already work on the next chunk. You'll need 2N-1 bit arrays, but for best CPU use the arrays should fit in per-core cache, probably less than 2 megabytes on today's computers, likely much less, so I don't think the memory use is any kind of an issue. The one thread will probably also wait, idle, for most of the time. That should be ok. Contrary to tuxdev's claim, this is lockless (apart from some semaphores for the output thread), and quite easily implemented as an example program here. I can prove it by writing it as an example if desired (I'm pretty annoyed at tuxdev's claim), and if our members promise not to submit it as their own homework. (I don't think trist007 would, but there may be some of their classmates or other students at the same level here.)

Edited to add: Heh, I just realized it is much simpler and probably a bit more efficient to let each thread handle a full chunk of the sieve alone.

Last edited by Nominal Animal; 12-29-2011 at 04:14 PM.
 
Old 12-30-2011, 10:28 PM   #13
tuxdev
Senior Member
 
Registered: Jul 2005
Distribution: Slackware
Posts: 2,012

Rep: Reputation: 115
Quote:
Contrary to tuxdev's claim, this is lockless (apart from some semaphores for the output thread), and quite easily implemented as an example program here. I can prove it by writing it as an example if desired (I'm pretty annoyed at tuxdev's claim), and if our members promise to not to submit it as their own homework.
Well, you've got expertise and experience... that kinda disqualifies you from inclusion in "pretty much all". For a mere mortal like me who has only dabbled a bit in game dev, I'm definitely not confident in getting it right. It's complicated to think through properly and easy to screw up in horribly subtle ways.
 
Old 01-01-2012, 02:39 AM   #14
trist007
Senior Member
 
Registered: May 2008
Distribution: Slackware
Posts: 1,052

Original Poster
Rep: Reputation: 70
Okay, here's my final draft. One thing I had issues with was handling the case where a thread finds a prime smaller than the largest prime already found, even after primes_to_find primes have already been found. So I had to add a few conditions in my thread function, crunch(), which makes my code less efficient.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

// global functions
void *crunch();
long long getNumber();
int isPrimeLongLong(long long n);
void bubbleSort(long long *);

// global variables
long long n, *archive;
int primes_to_find, threads, primes_found = 0;
pthread_t *tid;
pthread_mutex_t lock1 = PTHREAD_MUTEX_INITIALIZER;

int main(int argc, char *argv[])
{
   // local variables
   long i = 0;

   if(argc != 4)
   {
      fprintf(stderr, "Usage: %s <long long number> <# of primes> <# of threads>\n", argv[0]);
      return 1;
   }

   // print introduction and arguments
   fprintf(stderr, "Primecruncher by\ntrist007\n");

   // long long starting number
   n = atoll(argv[1]);

   // number of primes to find
   primes_to_find = atoi(argv[2]);

   // global archive of primes found
   archive = malloc(primes_to_find * sizeof(long long));

   // number of threads to start
   threads = atoi(argv[3]);

   // create calculation threads
   tid = malloc(threads * sizeof(pthread_t));

   for(i = 0; i < threads; i++)
   {
      if(pthread_create(&tid[i], NULL, crunch, NULL))
         return 1;
   }

   // wait for crunch threads to exit
   for(i = 0; i < threads; i++)
   {
      pthread_join(tid[i], NULL);
   }

   // sort the archive
   bubbleSort(archive);

   // view the archive
   for(i = 0; i < primes_to_find; i++)
   {
      printf("%llu\n", archive[i]);
   }

   // free memory
   free(archive);
   free(tid);

   // close program
   return 0;
}

void *crunch()
{
   long long number, temp, largest;
   int i, large_index;
   while(primes_found < primes_to_find)
   {
      number = getNumber();
      if(number != 0)
      {
         if(isPrimeLongLong(number))
         {
            pthread_mutex_lock(&lock1);
            if(primes_found == primes_to_find)
            {
               largest = archive[0];
               for(i = 0; i < primes_to_find; i++)
               {
                  if(archive[i] > largest)
                  {
                     largest = archive[i];
                     large_index = i;
                  }
               }
               if(number < largest)
               {
                  archive[large_index] = number;
               }
            }
            else
            {
               archive[primes_found] = number;
               primes_found++;
            }
            pthread_mutex_unlock(&lock1);
         }
      }
      else
      {
         pthread_exit(NULL);
      }
   }
   pthread_exit(NULL);
}

long long getNumber()
{
   pthread_mutex_lock(&lock1);
   long long ret = 0;
   if(primes_found < primes_to_find)
   {
      ret = n++;
   }
   pthread_mutex_unlock(&lock1);
   return ret;
}

void bubbleSort(long long *archive)  {
   long long tmp;
   int a, b;
   for(a = 1; a < primes_to_find; ++a)
      for(b = primes_to_find-1; b >= a; --b) {
         /* compare adjacent elements */
         if(archive[ b - 1] > archive[ b ]) {
            /* exchange elements */
            tmp = archive[ b - 1];
            archive[ b - 1] = archive[ b ];
            archive[ b ] = tmp;
         }
      }
}
And of course primelib.c
Code:
#include <math.h>

int isPrimeInt(int n)  {
   int k;
   for(k = 2; k < sqrt(n); k++)  {
      if ((n % k) == 0)
         return 1;
   }
   return 0;
}

int isPrimeLongLong(long long n)  {
    long long k;
   for(k = 2; k < sqrt(n); k++)  {
      if ((n % k) == 0)
         return 0;
   }
   return 1;
}
Here are my test runs to find 20 primes at least as great as 426182928722265301 using 1, 4, and 8 threads respectively.
rgonzale@ariel:~1027> time ./prime 426182928722265301 20 1
Primecruncher by
trist007
426182928722265313
426182928722265347
426182928722265413
426182928722265439
426182928722265479
426182928722265487
426182928722265499
426182928722265541
426182928722265551
426182928722265619
426182928722265629
426182928722265641
426182928722265653
426182928722265697
426182928722265719
426182928722265797
426182928722265853
426182928722265857
426182928722265937
426182928722266019

real 6m54.274s
user 6m54.061s
sys 0m0.001s

rgonzale@ariel:~1028> time ./prime 426182928722265301 20 4
Primecruncher by
trist007
426182928722265313
426182928722265347
426182928722265413
426182928722265439
426182928722265479
426182928722265487
426182928722265499
426182928722265541
426182928722265551
426182928722265619
426182928722265629
426182928722265641
426182928722265653
426182928722265697
426182928722265719
426182928722265797
426182928722265853
426182928722265857
426182928722265937
426182928722266019

real 2m5.881s
user 7m57.657s
sys 0m0.001s

rgonzale@ariel:~1029> time ./prime 426182928722265301 20 8
Primecruncher by
trist007
426182928722265313
426182928722265347
426182928722265413
426182928722265439
426182928722265479
426182928722265487
426182928722265499
426182928722265541
426182928722265551
426182928722265619
426182928722265629
426182928722265641
426182928722265653
426182928722265697
426182928722265719
426182928722265797
426182928722265853
426182928722265857
426182928722265937
426182928722266019

real 1m56.047s
user 14m32.902s
sys 0m0.001s

I ran these tests on a 2011 quad-core i7 Mac mini running Slackware 13.37 64-bit.
Code:
rgonzale@ariel:~503> cat /proc/cpuinfo
processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Intel(R) Core(TM) i7-2635QM CPU @ 2.00GHz
stepping	: 7
cpu MHz		: 1995.335
cache size	: 6144 KB
physical id	: 0
siblings	: 8
core id		: 0
cpu cores	: 4
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 3990.67
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

[Processors 1-7 report the same model and flags; only core id, apicid, and bogomips differ.]
It seems like 4 threads works best, since bumping to 8 adds little marginal benefit. Next I'm going to try replacing the threads with processes to see how that compares in this situation. I would also like to try out the more efficient prime-finding algorithms that Nominal Animal suggested, and then, longer term, the queue-based designs that skoona and Nominal Animal described. I still can't decide which method of using brackets I like better.
Code:
main
{
   stuff
}
or
Code:
main  {
   stuff
}
Using vim's "%" to check matching brackets works great. I think I'll go with the latter from now on; it looks cleaner. Thanks for all the comments, it's all great stuff. Happy New Year!!

Last edited by trist007; 01-01-2012 at 03:06 AM.
 
  

