import csv
import codecs
from itertools import islice
with codecs.open('ZTransPropertyInfo.txt', 'r', encoding='utf-8', errors='ignore') as zd, open('outfile.txt', 'w') as w:
for line in zd:
if 'MARICOPA' in line:
w.write(line)
with codecs.open('ZTransMain.txt', 'r', encoding='utf-8', errors='ignore') as zd, open('outfile2.txt', 'w') as w:
for line in zd:
if 'MARICOPA' in line:
w.write(line)
import pandas as pd
# reading two csv files
data1 = pd.read_csv('outfile.txt', delimiter = '|')
data2 = pd.read_csv('outfile2.txt', delimiter = '|')
/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (2,4,7,8,11,12,13,19,20,21,24,26,30,31,32,33,34,36,37,42,43,45,47,49,50,51,54,59,60,61) have mixed types.Specify dtype option on import or set low_memory=False.
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (5,7,8,9,10,17,22,30,31,32,33,34,35,37,39,40,50,54,55,56,58,60,63,67,71,74,75,82,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,100,103,105,106,107,109,111,112,113,114,116,117,119) have mixed types.Specify dtype option on import or set low_memory=False.
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
len(data1)
142869
len(data2)
9318789
coloutfile = pd.read_csv('colnames_outfile.csv')
coloutfile2 = pd.read_csv('colnames_outfile2.csv')
coloutfile2
| Unnamed: 0 | x | |
|---|---|---|
| 0 | 1 | TransId |
| 1 | 2 | FIPS |
| 2 | 3 | State |
| 3 | 4 | County |
| 4 | 5 | DataClassStndCode |
| … | … | … |
| 126 | 127 | TransactionTypeStndCode |
| 127 | 128 | BatchID |
| 128 | 129 | BKFSPID |
| 129 | 130 | ZVendorStndCode |
| 130 | 131 | SourceChkSum |
131 rows × 2 columns
print(coloutfile.columns)
Index(['Unnamed: 0', 'x'], dtype='object')
cname1 = coloutfile['x'].values
cname2 = coloutfile2['x'].values
cname1
array(['TransId', 'AssessorParcelNumber', 'APNIndicatorStndCode',
'TaxIDNumber', 'TaxIDIndicatorStndCode',
'UnformattedAssessorParcelNumber', 'AlternateParcelNumber',
'HawaiiCondoCPRCode', 'PropertyHouseNumber',
'PropertyHouseNumberExt', 'PropertyStreetPreDirectional',
'PropertyStreetName', 'PropertyStreetSuffix',
'PropertyStreetPostDirectional', 'PropertyBuildingNumber',
'PropertyFullStreetAddress', 'PropertyCity', 'PropertyState',
'PropertyZip', 'PropertyZip4', 'OriginalPropertyFullStreetAddress',
'OriginalPropertyAddressLastline', 'PropertyAddressStndCode',
'LegalLot', 'LegalOtherLot', 'LegalLotCode', 'LegalBlock',
'LegalSubdivisionName', 'LegalCondoProjectPUDDevName',
'LegalBuildingNumber', 'LegalUnit', 'LegalSection', 'LegalPhase',
'LegalTract', 'LegalDistrict', 'LegalMunicipality', 'LegalCity',
'LegalTownship', 'LegalSTRSection', 'LegalSTRTownship',
'LegalSTRRange', 'LegalSTRMeridian', 'LegalSecTwnRngMer',
'LegalRecordersMapReference', 'LegalDescription', 'LegalLotSize',
'PropertySequenceNumber', 'PropertyAddressMatchcode',
'PropertyAddressUnitDesignator', 'PropertyAddressUnitNumber',
'PropertyAddressCarrierRoute', 'PropertyAddressGeoCodeMatchCode',
'PropertyAddressLatitude', 'PropertyAddressLongitude',
'PropertyAddressCensusTractAndBlock',
'PropertyAddressConfidenceScore', 'PropertyAddressCBSACode',
'PropertyAddressCBSADivisionCode', 'PropertyAddressMatchType',
'PropertyAddressDPV', 'PropertyGeocodeQualityCode',
'PropertyAddressQualityCode', 'FIPS', 'LoadID', 'ImportParcelID',
'BKFSPID', 'AssessmentRecordMatchFlag', 'BatchID'], dtype=object)
data1.columns = cname1
data1.head
<bound method NDFrame.head of TransId AssessorParcelNumber APNIndicatorStndCode TaxIDNumber \
0 31447596 104-03-029 NaN NaN
1 31450688 104-07-002A NaN NaN
2 31451247 104-41-002B NaN NaN
3 31455254 104-07-004K NaN NaN
4 31455269 104-07-004K NaN NaN
... ... ... ... ...
142864 443923230 306-08-004 NaN NaN
142865 444307662 107-14-046B NaN NaN
142866 447601053 306-08-042 NaN
142867 44476352 NaN NaN NaN
142868 44562101 NaN NaN NaN
TaxIDIndicatorStndCode UnformattedAssessorParcelNumber \
0 NaN 10403029
1 NaN 10407002A
2 NaN 10441002B
3 NaN 10407004K
4 NaN 10407004K
... ... ...
142864 NaN 30608004
142865 NaN 10714046B
142866 30608042
142867 NaN NaN
142868 NaN NaN
AlternateParcelNumber HawaiiCondoCPRCode PropertyHouseNumber \
0 NaN NaN NaN
1 NaN NaN NaN
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
... ... ... ...
142864 NaN 2100
142865 NaN NaN NaN
142866 NaN NaN
142867 NaN NaN NaN
142868 NaN NaN
PropertyHouseNumberExt ... PropertyAddressMatchType \
0 NaN ... NaN
1 NaN ... NaN
2 NaN ... NaN
3 NaN ... NaN
4 NaN ... NaN
... ... ... ...
142864 NaN ... NaN
142865 NaN ... NaN
142866 NaN ... NaN
142867 NaN ... NaN
142868 NaN ... NaN
PropertyAddressDPV PropertyGeocodeQualityCode \
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
... ... ...
142864 NaN NaN
142865 NaN NaN
142866 NaN NaN
142867 NaN NaN
142868 NaN NaN
PropertyAddressQualityCode FIPS LoadID ImportParcelID \
0 NaN 4001 4.176882e+09 3471129.0
1 NaN 4001 4.176880e+09 3471280.0
2 NaN 4001 4.176885e+09 160724094.0
3 NaN 4001 4.288529e+09 3471293.0
4 NaN 4001 4.288529e+09 3471293.0
... ... ... ... ...
142864 NaN 4025 4.227734e+09 6467503.0
142865 NaN 4025 4.245149e+09 6420274.0
142866 NaN 4025 4.325089e+09 6467541.0
142867 NaN 4027 4.251788e+09 NaN
142868 NaN 4027 4.251693e+09 NaN
BKFSPID AssessmentRecordMatchFlag BatchID
0 26719 1 4381728
1 30505 1 4381728
2 31294 1 4381728
3 36088 1 4585783
4 36089 1 4585783
... ... ... ...
142864 146591 1 4480589
142865 581706 1 4510412
142866 19257490 1 4671832
142867 158159 0 4537074
142868 50303 0 4537078
[142869 rows x 68 columns]>
data2.columns = cname2
data2.head
<bound method NDFrame.head of TransId FIPS State County DataClassStndCode RecordTypeStndCode \
0 31466602 4001 AZ APACHE D P
1 31490181 4001 AZ APACHE M P
2 31490639 4001 AZ APACHE M P
3 31494701 4001 AZ APACHE M P
4 31614926 4003 AZ COCHISE M P
... ... ... ... ... ... ...
9318784 44250889 4025 AZ YAVAPAI M P
9318785 44257768 4025 AZ YAVAPAI M P
9318786 44261774 4025 AZ YAVAPAI M P
9318787 44291305 4025 AZ YAVAPAI M P
9318788 44568103 4027 AZ YUMA M P
RecordingDate RecordingDocumentNumber RecordingBookNumber \
0 2007-11-05 2007-010777 NaN
1 2005-07-29 2005-007363 NaN
2 2006-01-30 2006-001267 NaN
3 2012-11-01 2012-005919 NaN
4 2009-07-10 2009-16844 NaN
... ... ... ...
9318784 2007-05-03 4133816 4503
9318785 2007-11-06 4188215 4552
9318786 2008-03-27 4222598 4584
9318787 2013-04-12 2013-0021421 4950
9318788 2012-10-29 2012-27912 NaN
RecordingPageNumber ... MatchStndCode REOStndCode \
0 NaN ... NM NaN
1 NaN ... NM NaN
2 NaN ... MA NaN
3 NaN ... NM NaN
4 NaN ... MA NaN
... ... ... ... ...
9318784 811 ... MA NaN
9318785 855 ... NM NaN
9318786 295 ... NM NaN
9318787 159 ... MA NaN
9318788 NaN ... MA NaN
UpdateOwnershipFlag LoadID StatusInd TransactionTypeStndCode \
0 4176950433 NaN
1 4176785559 NaN
2 4176784953 NaN
3 4176785341 NaN
4 4240489498 NaN
... ... ... ... ...
9318784 3873932235 NaN
9318785 3873963458 NaN
9318786 3873963871 NaN
9318787 3874013809 NaN
9318788 4251574354 NaN
BatchID BKFSPID ZVendorStndCode SourceChkSum
0 4381728 24330 BKF -8.397281e+08
1 4381724 3768 BKF 1.084165e+09
2 4381724 4234 BKF 1.789306e+08
3 4381724 2182 BKF -1.926040e+09
4 4503023 23484 BKF 1.796548e+09
... ... ... ... ...
9318784 3819760 73366 BKF 5.891026e+07
9318785 3819760 80246 BKF 1.754581e+07
9318786 3819760 84271 BKF 9.791578e+07
9318787 3819760 114215 BKF -9.256134e+08
9318788 4537078 56570 BKF -8.043522e+08
[9318789 rows x 131 columns]>
# using merge function by setting how='inner'
output1 = pd.merge(data1, data2,
on='TransId',
how='inner')
output1.to_csv('merge1.csv')
databuyer = pd.read_csv('ZTransBuyerName.txt', sep = '|', header=None, error_bad_lines=False)
dataseller = pd.read_csv('ZTransSellerName.txt', sep = '|', header=None, error_bad_lines=False)
b'Skipping line 2083791: expected 10 fields, saw 13\n'
dataseller
| TransId | Name | N2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31996253 | DONALD S | MERRELL | DONALD S MERRELL | NaN | 1 | NaN | 3.847747e+09 | 4011 | 1000485622 |
| 1 | 31996254 | DONALD S | MERRELL | DONALD S MERRELL | NaN | 1 | NaN | 3.847747e+09 | 4011 | 1000485622 |
| 2 | 31996255 | DONALD S | MERRELL | DONALD S MERRELL | NaN | 1 | NaN | 3.847747e+09 | 4011 | 1000485622 |
| 3 | 31996256 | DONALD S | MERRELL | DONALD S MERRELL | NaN | 1 | NaN | 3.847747e+09 | 4011 | 1000485622 |
| 4 | 31996257 | KATHY ANN | CHANDLER | KATHY ANN CHANDLER | NaN | 1 | NaN | 3.847747e+09 | 4011 | 1000485622 |
| … | … | … | … | … | … | … | … | … | … | … |
| 14804494 | 448919808 | JESUS MANUEL | GOMEZ | JESUS MANUEL GOMEZ | NaN | 1 | NaN | NaN | 4023 | 1000505493 |
| 14804495 | 448919808 | ANA CECILIA | GOMEZ | ANA CECILIA GOMEZ | NaN | 2 | NaN | NaN | 4023 | 1000505493 |
| 14804496 | 448919808 | JESUS M | GOMEZ | JESUS M GOMEZ | NaN | 3 | NaN | NaN | 4023 | 1000505493 |
| 14804497 | 448919808 | ANA C | GOMEZ | ANA C GOMEZ | NaN | 4 | NaN | NaN | 4023 | 1000505493 |
| 14804498 | 448919809 | JOSE A | CEDENO | JOSE A CEDENO | NaN | 1 | NaN | NaN | 4023 | 1000505493 |
14804499 rows × 10 columns
databuyer.columns = ['TransId', 'Name', 'N2', '3', '4', '5', '6', '7', '8', '9']
dataseller.columns = ['TransId', 'Name', 'N2', '3', '4', '5', '6', '7', '8', '9']
# using merge function by setting how='inner'
output2 = pd.merge(output1, databuyer,
on='TransId',
how='inner')
output2.to_csv('merge_buyer.csv')
# using merge function by setting how='inner'
output3 = pd.merge(output1, dataseller,
on='TransId',
how='inner')
output3.to_csv('merge_seller.csv')