Create Weighted Usernames

This script uses the data files dist.male.first, dist.female.first, and dist.all.last available from the US Census Bureau to create weighted lists of usernames.

A zip file of all three files is available here: http://wirefall.com/census.zip

#!/usr/bin/perl
 #CreateUsernames.pl
 #Create weighted usernames from census data
 #Dustin K. Dykes
 #v1.1 20121107




use Getopt::Std;
 use feature qw/switch/; 




# Parse the command-line switches. In the getopts spec a letter followed by
# ':' takes a value; 'b' and 'h' are plain flags.
%options = ();
getopts("bc:f:hi:l:m:p:s:t:", \%options);

# Copy every switch into the global the rest of the script reads.
# Switches that were not supplied leave their variable undefined.
($bare, $case, $fname, $help, $initial,
 $length, $myname, $percent, $separator, $top)
  = @options{qw(b c f h i l m p s t)};




# -h: print the usage text (switch summary plus sample invocations) and exit
# before any census file is opened or read.
if ($help){
 print "\nThis script creates weighted usernames from the census files \ndist.male.first, dist.female.first, and dist.all.last\n
  -h    Help
  -b   Bare - return just usernames - no statistics.
  -c (ulm) Case - upper, lower, or mixed.
   Default is lower.
  -f (fma) First name - male, female, or all.
   Default is all.
  -i (ba)  First initial instead of first name - before or after last name.
  -l (\#)   Length of username - most useful for initial+last name.
   No intelligence applied, just truncated.
  -m (uid) Return the position in the index of the provided username.
   The username must match the flags,
   i.e. \"-i b -m asmith\" vs \"-s . -m adam.smith\"
  -p (\#)   Percent of population covered (first and last names).
   Default is 50% and is capped at 90% (as is census data).
   All first names are used to create weighting for initials.
   50% produces 337,094 usernames
   60% produces 1,222,747 usernames
   70% produces 4,893,203 usernames
   75% produces 11,011,702 usernames (using 4.5GB of memory)
   Recommendation: Increase as high as possible and use -t (top)
   to limit the number of returned usernames.
  -s (.)   Separator - between first name or initial and last name.
   Default is none.
  -t (\#)   Top number usernames.
   Default is all, but also limited by -p (percentage).

Examples: CreateUsernames.pl -b -c m -i b -l 8 -s _ -t 6
    J_Johnson
    M_Smith
    J_Williams
    J_Jones
    J_Brown
    M_Johnson

   CreateUsernames.pl -s . -t 5
    3.290626        john.smith
    3.161858        robert.smith
    2.68758 james.johnson
    2.64951 john.johnson
    2.644774        michael.smith\n

   CreateUsernames.pl  -i b -m ddykes
    Username ddykes not found! Try increasing percentage (-p)

   CreateUsernames.pl -i b -p 70 -m ddykes
    27411    ddykes

   CreateUsernames.pl -s . -p 70 -m alex.smith
    5601     alex.smith\n";
 exit;
 }




# Fill in defaults for switches that were omitted or given a false value
# (so "-p 0" also falls back to 50).
$fname   ||= "a";    # first names: all
$percent ||= 50;     # population coverage

# Looking up a username (-m) always works on bare usernames, so force -b.
$bare = 1 if $myname;




# Open the three census data files from the current directory.
#
# The three-argument form of open keeps the mode separate from the file name,
# and every failure is now reported with the system error. A first-name file
# is only fatal when the -f selection actually reads it, so running with
# "-f m" still works without dist.female.first (and vice versa). The last-name
# file is always required.
if (!open(MALE, "<", "dist.male.first")
    && (($fname eq "a") || ($fname eq "m"))) {
  die "Cannot open dist.male.first: $!\n";
}
if (!open(FEMALE, "<", "dist.female.first")
    && (($fname eq "a") || ($fname eq "f"))) {
  die "Cannot open dist.female.first: $!\n";
}
open(LAST, "<", "dist.all.last")
  or die "Cannot open dist.all.last: $!\n";




# Load male first names when -f selects "all" or "male".
# Each kept record is stored in @male as "name<TAB>weight" (columns 1 and 2).
# With -i every record is kept, because all names feed the initial weights;
# otherwise a record is kept only when its third column is <= the -p value.
if (($fname eq "m") || ($fname eq "a")) {
  while (my $record = <MALE>) {
    chomp $record;
    my @cols = split(/\s+/, $record);
    next unless $initial || ($cols[2] <= $percent);
    push(@male, "$cols[0]\t$cols[1]");
  }
}




# Load female first names when -f selects "all" or "female".
# Kept records go into @female as "name<TAB>weight". The -p cutoff on the
# third column is applied only when full first names are used; with -i the
# whole file is read so that every name contributes to its initial.
if (($fname eq "f") || ($fname eq "a")) {
  while (my $line = <FEMALE>) {
    chomp $line;
    my @fields = split(/\s+/, $line);
    if ($initial || ($fields[2] <= $percent)) {
      push(@female, "$fields[0]\t$fields[1]");
    }
  }
}




# Pool the selected male and female first names into one weighted list.
@first = (@male, @female);

# With -i, replace the name list by 26 "initial<TAB>weight" entries (a..z, in
# that order), where each weight is the sum of the weights of all first names
# beginning with that letter.
#
# This uses a hash tally instead of given/when: the switch feature is
# experimental and triggers warnings on many Perl versions, and the old code
# needed 26 package globals, among them sort's special $a and $b.
if ($initial) {
  # Every letter starts at 0, so unused initials are still emitted; they are
  # dropped later because their product with any last-name weight is 0.
  my %weight_of = map { $_ => 0 } ("a" .. "z");

  foreach my $entry (@first) {
    my @fields = split(/\t/, $entry);
    my $letter = substr($fields[0], 0, 1);
    # Only lowercase a-z initials are tallied; any other first character is
    # ignored, exactly as before.
    $weight_of{$letter} += $fields[1] if exists $weight_of{$letter};
  }

  @first = map { "$_\t$weight_of{$_}" } ("a" .. "z");
}




# Load last names into @last as "name<TAB>weight" (columns 1 and 2), keeping
# only records whose third column is <= the -p value.
while (my $record = <LAST>) {
  chomp $record;
  my @cols = split(/\s+/, $record);
  push(@last, "$cols[0]\t$cols[1]") if $cols[2] <= $percent;
}




# Combine every last name with every first name (or initial) and store the
# result in @uid as "weight<TAB>username". The weight is the product of the
# two name weights; combinations with a weight of 0 are skipped.
foreach my $last_entry (@last) {
  my @last_rec = split(/\t/, $last_entry);

  foreach my $first_entry (@first) {
    my @first_rec = split(/\t/, $first_entry);
    my $weight = $first_rec[1] * $last_rec[1];
    next if $weight == 0;

    # -c: "u" upper-cases both parts, "m" capitalises the first letter of
    # each part, anything else leaves the names as read from the files.
    my ($given, $family) = ($first_rec[0], $last_rec[0]);
    if ($case eq "u") {
      ($given, $family) = (uc($given), uc($family));
    }
    elsif ($case eq "m") {
      ($given, $family) = (ucfirst($given), ucfirst($family));
    }

    # -i a puts the first-name part after the last name; every other setting
    # puts it in front.
    my $username = ($initial eq "a")
      ? "$family$separator$given"
      : "$given$separator$family";

    push(@uid, "$weight\t$username");
  }
}




# Order the candidates from highest to lowest weight. Each element of @uid is
# a "weight<TAB>username" string; the numeric <=> comparison evaluates each
# string as a number, which takes its value from the leading weight.
@suid = sort {$b <=> $a} @uid;




# -l: shorten every username with substr(name, 0, $length) while keeping its
# weight column. No attempt is made to merge usernames that become identical.
if ($length) {
  @suid = map {
    my @pair = split(/\t/, $_);
    "$pair[0]\t" . substr($pair[1], 0, $length);
  } @suid;
}




# -b (also forced by -m): drop the weight column so that each element of
# @suid is just the username.
if ($bare) {
  @suid = map {
    my @pair = split(/\t/, $_);
    "$pair[1]";
  } @suid;
}




# Output stage: either print the (optionally limited) list, or with -m report
# where the given username sits in the sorted list.
if (!$myname) {
  if ($top) {
    # Print the first $top entries, starting at index 0 (the highest weight).
    # The old loop ran over indexes 1..$top, which skipped the best candidate
    # and printed blank lines when $top exceeded the number of usernames.
    my $last_index = $top - 1;
    $last_index = $#suid if $last_index > $#suid;
    for my $position (0 .. $last_index) {
      print "$suid[$position]\n";
    }
  }
  else {
    foreach my $entry (@suid) {
      print "$entry\n";
    }
  }
}
else {
  # Map each username to its zero-based position in the sorted list. If a
  # username occurs more than once, the position of its last occurrence wins.
  my %index;
  @index{@suid} = (0 .. $#suid);

  # Test for existence rather than truth: position 0 (the most likely
  # username) is a false value and used to be reported as "not found".
  if (exists $index{$myname}) {
    print "$index{$myname}\t $myname\n";
  }
  else {
    print "Username $myname not found! Try increasing percentage (-p)\n";
  }
}

 

Leave a Reply

Your email address will not be published. Required fields are marked *