DEV Community

Cover image for Set Data Structure in C
João Godinho
João Godinho

Posted on

Set Data Structure in C

  • In this article I will show how to implement a Set data structure in C using a hashtable, and discuss complexity, trade-offs, and possible improvements.

Prerequisites:

  • Basic knowledge of programming (logic, etc);
  • C syntax, allocating variables;
  • Memory management in C: pointers, malloc, free;
  • Basic understanding of hashing;

What is a Set?

  • A data structure that stores unique elements.
  • Same idea as in set theory in mathematics: no repeated values are allowed.

Why use a Hashtable?

  • A hashtable is a good choice for implementing a Set:
    • insert, find, remove -> O(1) average
  • Collisions can happen, so we need a strategy to handle them.

Collision Handling (Chaining)

  • In this implementation, collisions are handled using a linked list per bucket.
  • Multiple elements that hash to the same index are stored in the same list.

Complexity

  • Let:

    • m = number of buckets
    • k = elements in one bucket
    • n = total elements
  • Worst case:

    • insert O(k)
    • find O(k)
    • remove O(k)
    • iterate O(m + n)
    • isEmpty O(1)
  • Average case:

    • insert O(1)
    • find O(1)
    • remove O(1)
    • iterate O(m + n) -> O(n) if m is constant
    • isEmpty O(1)

Trade-offs and Improvements

  • Load factor not handled:

    • I haven't handled load factor by rehashing since the focus is the Set itself.
    • This can degrade performance as n grows.
    • Improvement: dynamic resizing (grow/shrink).
  • Hashtable vs Balanced Binary Search Tree (BST):

    • A hashtable is generally better for unordered sets, because it provides constant-time average performance and does not maintain order.
    • A balanced BST is preferable only when worst-case guarantees or sorted order are more important than average performance.
    • A set is an unordered data structure, so using a hashtable is preferable.
  • Time complexity comparison (insert, find, remove):

    • Hashtable:
    • Average case: O(1)
    • Worst case: O(n)
    • Balanced BST:
    • Average case: O(log n)
    • Worst case: O(log n)
  • To check load factor handling and hashtable growth using open addressing with double hashing:

    https://github.com/godinhojoao/dsa-studies/blob/main/dsa-in-c/hashtable.c

How to Develop a Set in C

  • In this code:
    • A hashtable with fixed size is used
    • Collisions are handled with linked lists (chaining)
    • FNV-1a is used as the hash function
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SET_LIMIT_SIZE 1000

typedef struct SetNode {
  struct SetNode* next;
  int value;
} SetNode;

typedef struct Set {
  SetNode** nodes;
  int currLength;
} Set;

// 32-bit FNV-1a
unsigned int fnv1a_int(int value) {
  unsigned int hash = 2166136261u;
  unsigned char* p = (unsigned char*)&value;

  for(int i = 0; i < (int)sizeof(int); i++) {
    hash ^= (unsigned int)p[i];
    hash *= 16777619u;
  }

  return hash;
}

int itemIndex(int value, int arrLimit) {
  unsigned int hash = fnv1a_int(value);
  return hash % arrLimit;
}

Set* initializeSet() {
  Set* set = malloc(sizeof(Set));
  set->currLength = 0;
  set->nodes = malloc(sizeof(SetNode*) * SET_LIMIT_SIZE);

  for(int i = 0; i < SET_LIMIT_SIZE; i++) {
    set->nodes[i] = NULL;
  }

  return set;
}

SetNode* createNode(int value) {
  SetNode* newNode = malloc(sizeof(SetNode));
  newNode->next = NULL;
  newNode->value = value;
  return newNode;
}

void insert(Set* set, int value) {
  int index = itemIndex(value, SET_LIMIT_SIZE);
  SetNode* currentNode = set->nodes[index];

  SetNode* prevNode = NULL;
  while(currentNode != NULL) {
    if(currentNode->value == value) {
      printf("set do not allow repeated values: %d\n", value);
      return;
    }
    prevNode = currentNode;
    currentNode = currentNode->next;
  }

  SetNode* newNode = createNode(value);
  set->currLength += 1;

  if(prevNode == NULL) {
    set->nodes[index] = newNode;
  } else {
    prevNode->next = newNode;
  }
}

int isEmpty(Set* set) {
  return set->currLength == 0;
}

SetNode* find(Set* set, int value) {
  if(isEmpty(set))
    return NULL;

  int index = itemIndex(value, SET_LIMIT_SIZE);
  SetNode* currNode = set->nodes[index];

  while(currNode != NULL && currNode->value != value) {
    currNode = currNode->next;
  }

  return currNode;
}

int removeItem(Set* set, int value) {
  if(isEmpty(set))
    return -1;

  int index = itemIndex(value, SET_LIMIT_SIZE);
  SetNode* currentNodeToDelete = set->nodes[index];

  SetNode* prevNode = NULL;
  while(currentNodeToDelete != NULL && currentNodeToDelete->value != value) {
    prevNode = currentNodeToDelete;
    currentNodeToDelete = currentNodeToDelete->next;
  }

  if(currentNodeToDelete == NULL)
    return -1;

  if(prevNode != NULL) {
    prevNode->next = currentNodeToDelete->next;
  } else {
    set->nodes[index] = currentNodeToDelete->next;
  }

  free(currentNodeToDelete);
  set->currLength -= 1;

  return 1;
}

int main() {
  Set* set = initializeSet();

  printf("== insert 10, 20, 30, 10 ==\n");
  insert(set, 10);
  insert(set, 20);
  insert(set, 30);
  insert(set, 10);  // duplicate test

  printf("== find ==\n");
  printf("find 10: %s\n", find(set, 10) ? "found" : "not found");
  printf("find 99: %s\n", find(set, 99) ? "found" : "not found");

  printf("== remove ==\n");
  printf("remove 99: %d\n", removeItem(set, 99));
  printf("remove 20: %d\n", removeItem(set, 20));
  printf("remove 10: %d\n", removeItem(set, 10));
  printf("remove 10 again: %d\n", removeItem(set, 10));

  printf("== final finds ==\n");
  printf("find 10: %s\n", find(set, 10) ? "found" : "not found");
  printf("find 20: %s\n", find(set, 20) ? "found" : "not found");
  printf("find 30: %s\n", find(set, 30) ? "found" : "not found");

  return 0;
}
Enter fullscreen mode Exit fullscreen mode

References

Top comments (0)